html2markdown 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,14 @@
1
# Fetch gems over TLS; plain http lets a network attacker tamper with downloads.
source 'https://rubygems.org'

# Tooling needed only to develop and test the gem.
group :development, :test do
  gem 'rspec'
  # Testing infrastructure
  gem 'guard'
  gem 'guard-rspec'
end

# Also declared as a runtime dependency in html2markdown.gemspec.
gem 'nokogiri'

# Specify your gem's dependencies in html2markdown.gemspec
gemspec
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
data/Readme.md ADDED
@@ -0,0 +1,3 @@
1
+ ### Yet Another HTML to Markdown Ruby Library
2
+ We love Markdown because it is easy to edit.
3
+ So we want everything to be Markdown.
@@ -0,0 +1,19 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/html2markdown/version', __FILE__)
3
+
4
# Gem specification for html2markdown. The packaged file lists are taken from
# whatever git currently tracks, so this must be evaluated inside the repository.
Gem::Specification.new do |spec|
  # Identity
  spec.name        = "html2markdown"
  spec.version     = HTML2Markdown::VERSION
  spec.authors     = ["mike lee"]
  spec.email       = ["mike.d.1984@gmail.com"]
  spec.homepage    = ""
  spec.summary     = "simple and flexible html to markdown converter"
  spec.description = "simple and flexible html to markdown converter"

  # Runtime dependencies
  spec.add_dependency 'nokogiri'

  # Packaged files: one path per line of `git ls-files` output.
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]
end
@@ -0,0 +1,7 @@
1
+ require "html2markdown/version"
2
+ require_relative 'html2markdown/converter'
3
+ require_relative 'html2markdown/html_page'
4
+
5
# Root namespace of the html2markdown gem. Nothing is defined here directly;
# this file only opens the module after requiring the gem's other files.
module HTML2Markdown; end
@@ -0,0 +1,83 @@
1
+ require 'nokogiri'
2
+ require_relative 'exceptions'
3
+
4
module HTML2Markdown
  # Mixin that turns an HTML string into Markdown text.
  #
  # The HTML is parsed with Nokogiri and walked recursively by #parse_element;
  # each node is rendered by #wrap_node. A host object can take over the
  # rendering of a tag by exposing a public `parse_<tag>(node, contents)`
  # method, either written by hand or registered through the block shortcut
  # implemented in #method_missing:
  #
  #   page.img { |node, contents| "![#{node['alt']}](#{node['src']})" }
  module Converter

    # Converts an HTML string to Markdown.
    #
    # @param string_contents [String] the HTML markup to convert
    # @return [String] the rendered Markdown
    # @raise [NoContents] when the argument is not a String (including nil)
    def to_markdown(string_contents)
      raise NoContents unless string_contents.is_a?(String)

      doc = Nokogiri::HTML(string_contents)
      doc.children.map { |child| parse_element(child) }.join
    end

    # Renders one parsed node and, recursively, everything below it.
    # Text nodes become their text followed by a newline; any other node is
    # handed to #wrap_node together with the rendered output of its children
    # (or its own text when it has no children).
    #
    # @param ele [Nokogiri::XML::Node] node to render
    # @return [String]
    def parse_element(ele)
      return "#{ele.text}\n" if ele.is_a?(Nokogiri::XML::Text)

      children = ele.children
      inner = if children.count > 0
                children.map { |child| parse_element(child) }.join
              else
                ele.text
              end
      wrap_node(ele, inner)
    end

    # Wraps already-rendered contents in the Markdown syntax for +node+.
    #
    # @param node [#name, #[]] the element; only its tag name and attributes are read
    # @param contents [String, nil] rendered inner text; it is stripped on a copy,
    #   so the caller's string is never modified
    # @return [String] Markdown for the node ('' for hidden nodes)
    def wrap_node(node, contents = nil)
      contents = contents.strip unless contents.nil?

      # A custom handler, if the host object has one, wins over everything else.
      # `parse_element` is the tree walker above, not a tag handler, so an
      # <element> tag must not be dispatched to it.
      handler = "parse_#{node.name}"
      if handler != 'parse_element' && respond_to?(handler)
        return send(handler, node, contents)
      end

      # skip hidden node
      return '' if node['style'] && node['style'] =~ /display:\s*none/

      text = contents.to_s
      case node.name.downcase
      when 'i'
        # NOTE(review): <i> content is discarded, exactly as before (the original
        # branch was empty and Ruby `when` does not fall through). Confirm
        # whether emphasis or list-item output was intended.
        ''
      when 'li'
        # Markdown needs a space between the bullet and the item text.
        "* #{text}\n"
      when 'blockquote'
        # Quote every line, not just the first one.
        text.split("\n").map { |line| ">#{line}\n" }.join
      when 'strong'
        "**#{text}**\n"
      when /\Ah([1-6])\z/
        # ATX heading: one '#' per level, then the mandatory space.
        "#{'#' * Regexp.last_match(1).to_i} #{text}\n"
      when 'hr'
        "****\n"
      when 'br'
        "\n"
      when 'img'
        "![#{node['alt']}](#{node['src']})"
      when 'a'
        "[#{text}](#{node['href']})"
      else
        text
      end
    end

    # Registers a custom node processor: `obj.<tag> { |node, contents| ... }`
    # defines `parse_<tag>` on the host class (so it applies to every instance
    # of that class), which #wrap_node then prefers over the default rendering.
    #
    # Calls without a block are not registrations and are passed up the chain,
    # so typos and implicit conversions such as `to_ary` raise NoMethodError
    # instead of silently defining a handler that would fail on `nil.call`.
    # No `respond_to_missing?` is provided on purpose: whether a call is
    # handled depends on the block, which `respond_to?` cannot see.
    def method_missing(name, *args, &block)
      return super unless block

      self.class.send(:define_method, "parse_#{name}") do |node, contents|
        block.call(node, contents)
      end
    end

    # Development helper: prints the value of the given block between two rules.
    def debug
      puts '----------------------------------'
      puts yield
      puts '----------------------------------'
    end

  end
end
@@ -0,0 +1,3 @@
1
module HTML2Markdown
  # Error raised with the name NoContents, i.e. when there is no usable content
  # to convert. It derives from StandardError so that an ordinary `rescue` /
  # `rescue => e` catches it; deriving from Exception would bypass those.
  class NoContents < StandardError; end
end
@@ -0,0 +1,18 @@
1
+ require_relative 'converter'
2
+ require 'open-uri'
3
+
4
# An HTML page that can be converted to Markdown through the
# HTML2Markdown::Converter mixin. The markup is either supplied directly or
# downloaded from a URL when the page is created.
class HTMLPage
  include HTML2Markdown::Converter
  attr_accessor :url, :contents

  # @param options [Hash] recognised keys:
  #   :contents - HTML markup to use as-is; when nil the page is downloaded
  #   :url      - address to download when :contents is not given
  #   :host     - stored in @host; not read anywhere in this class
  # @param content_extrator [Proc] optional block, stored in @content_extrator;
  #   it is not invoked by this class
  # @raise [ArgumentError] when neither :contents nor :url is provided
  def initialize(options, &content_extrator)
    @host = options[:host]
    @url = options[:url]
    @contents = options[:contents]
    @contents = fetch_contents(@url) if @contents.nil?
    @content_extrator = content_extrator
  end

  private

  # Downloads +url+ and returns the inner HTML of its <body>, falling back to
  # the whole document when the parsed page has no body element.
  def fetch_contents(url)
    raise ArgumentError, 'either :contents or :url must be given' if url.nil?

    # Going through URI#open (provided by open-uri) instead of Kernel#open
    # keeps a "| command" string from being executed, still works on Ruby 3
    # where Kernel#open no longer opens URLs, and the block form closes the
    # stream once the document has been parsed.
    doc = URI.parse(url).open { |io| Nokogiri::HTML(io) }
    body = doc.at_css('body')
    body ? body.inner_html : doc.inner_html
  end

end
@@ -0,0 +1,3 @@
1
module HTML2Markdown
  # Version of the html2markdown gem; frozen so it cannot be altered at runtime.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,77 @@
1
+ # coding:utf-8
2
+ require 'spec_helper'
3
+ require_relative '../../lib/html2markdown'
4
+
5
+ describe HTMLPage do
6
+
7
+ # it "should have url and contents property" do
8
+ # url = 'http://bbs.qyer.com/viewthread.php?tid=503325&extra=page%3D1'
9
+ # page = HTMLPage.new :url => url
10
+ # page.contents.should_not be_nil
11
+ # page.url.should == url
12
+ # end
13
+
14
+ it "can convert to markdow format" do
15
+ contents = <<-CON
16
+ \n<i class=\"pstatus\"> 本帖最后由 aria_lyy 于 2012-3-21 22:22 编辑 </i><br><br><strong>
17
+ <font color=\"Navy\"><font size=\"4\"><font size=\"5\">看贴前请先看这儿——如果您不喜欢旅游,
18
+ 不喜欢自助旅游,不喜欢动点脑筋玩儿的舒坦点,不喜欢花点时间挑选旅行时自己可心儿的衣食住行景,
19
+ 您就赶紧把这个页面关了吧,不然实在浪费您的时间和国家那所剩无几的电资源</font></font></font></strong>
20
+ <br><br><font color=\"Red\"><font size=\"5\">【前言】</font></font><br><br><strong>
21
+ <font size=\"4\"><font color=\"Navy\">【关于去旅游和不去旅游的理由】<br>\r\n1.如果你喜欢旅游尤其自助游,却有各种“不能去旅游的理由”请跳转至2、3、4、5、6、7!
22
+ <img src=\"images/smilies/default/lol.gif\" smilieid=\"12\" border=\"0\" alt=\"\"> 如果您没有那些讨厌的理由,请跳转至8!
23
+ <img src=\"images/smilies/default/loveliness.gif\" smilieid=\"28\" border=\"0\" alt=\"\"><br>\r\n2.如果你认为是钱阻挡了旅游的步伐?LZ也是个工薪阶层,也是每年辛苦工作,攒出旅游的钱;再者说,哪怕只走个国内游,也算开了眼界、见了世面,钱,不是问题,更何况有许多人坚持穷游概念,钱就更不是问题;<br>\r\n3.如果你认为是假期阻挡了旅游的步伐?LZ和内口子每年也只有7天年假,却仍能挤一挤,腾出10天左右的时间去趟国外逍遥一番;再者说,哪怕只是4天的海岛游,也算晒出个富豪脸~别说工作放不下,那是你根本不想去旅游;<br>\r\n4.如果你认为是语言阻挡了旅游的步伐?LZ2006年第一次去意大利、2009年第二次走意大利、走希腊、走法国,2010年去南非、马来西亚,2011年马耳他、意大利、马来西亚,再到今年这趟西班牙,从来!从来从来!没因为二把刀(也就徘徊在国家英语三到四级的边缘)不成句的英语迷路、饿肚子、丢东西丢人、误火车飞机大炮、买不成东西,关键我们去的很多都是非英语国家,经常会出现我们用英语跟别人交流,别人用叽里呱啦的母语跟我们交谈,然后解决一件特别难的大事儿~~GOOD!东特哇瑞,别担心沟通有问题,那不是旅行的重点!<br>\r\n5.没伙伴?不旅行?哇塞,你一定是孤独症患者;<br>\r\n6.爹妈不让?不旅行?哇塞,还是去街心花园吧;<br>\r\n7.懒得查这个那个?你如果愿意点开这个帖子,就说明你根本不懒~</font></font></strong><br><strong><font size=\"5\"><font color=\"DarkRed\">8.那就揣着护照,动身吧!!!!!</font></font></strong>\n
24
+ CON
25
+ page = HTMLPage.new :contents => contents
26
+ markdown = page.to_markdown page.contents
27
+ markdown.length.should > 0
28
+ end
29
+
30
+ it "can accept custom parse" do
31
+ contents = <<-CON
32
+ <table cellspacing="0" cellpadding="0"><tbody><tr><td class="t_msgfont" id="postmessage_4983254"><i class="pstatus"> 本帖最后由 theme_of_ryan 于 2011-12-29 22:21 编辑 </i><br>
33
+ <br>
34
+ <font size="4"><font face="微软雅黑 "><font size="3"><font face="微软雅黑 ">今年7月和几个<span href="tag.php?name=%E6%9C%8B%E5%8F%8B" onclick="tagshow(event)" class="t_tag">朋友</span>在吃饭漫无边际聊天的时候,突然提出今年12月去<span href="tag.php?name=%E6%B3%95%E5%9B%BD" onclick="tagshow(event)" class="t_tag">法国</span>和西班牙深度游的想法,当晚查询<span href="tag.php?name=%E6%9C%BA%E7%A5%A8" onclick="tagshow(event)" class="t_tag">机票</span>刚好遇到特价机票,毫不犹豫果断出票。去年我和男朋友自助游去过<span href="tag.php?name=%E6%84%8F%E5%A4%A7%E5%88%A9" onclick="tagshow(event)" class="t_tag">意大利</span>、克罗地亚和希腊22天(<strong><a href="http://bbs.go2eu.com/viewthread.php?tid=494274" target="_blank"><strong><font color="#0066cc">游记点这里</font></strong></a></strong>),所以做攻略的任务便落在了我身上。于是……我慢悠悠的开始准备阶段,现在接近完成阶段即开帖和大家分享当中的乐趣,把我最感兴趣、最关注的关键点写在帖子里,偶尔还需要和大家请教过程中遇到的问题。</font><font face="微软雅黑 "></font> <br>
35
+ <font face="微软雅黑 "><strong><font color="#ff00"></font></strong></font></font></font></font><br>
36
+ <font size="4"><font face="微软雅黑 "><font size="3"><font face="微软雅黑 "><strong><font color="#ff00">内容预告(呕心沥血的超级攻略放送ing):</font></strong></font><br>
37
+ <font color="#ff00"><font face="微软雅黑 ">NOW!全程旅馆预订tips更新中<br>
38
+ NEXT!<br>
39
+ 1、收集的参考资讯的连接tips<br>
40
+ 2、全程涉及交通工具的预订方式(网站、班次和价格)<br>
41
+ </font></font><font color="#ff00"><font face="微软雅黑 ">3、全程目的地地图、游玩路线、标准景点介绍、独特景点推荐、餐厅推荐、Shopping推荐、最佳摄影点及注意事项</font></font></font></font></font><br>
42
+ <br>
43
+ <font face="微软雅黑 "><font size="3"><strong>DIY的手绘行程图</strong></font></font><br>
44
+ <span style="position: absolute; display: none" id="attach_1254048" onmouseover="showMenu({'ctrlid':this.id,'pos':'13'})"><img src="images/go2eu/attachimg.gif" border="0"></span>
45
+ <img src="http://att.qyer.com/day_111228/1112282241c0deb3a69dcd9cf5.jpg" file="http://att.qyer.com/day_111228/1112282241c0deb3a69dcd9cf5.jpg" width="700" class="zoom" onclick="zoom(this, this.src)" id="aimg_1254048" onmouseover="showMenu({'ctrlid':this.id,'pos':'12'})" alt="55105222201110032253301023057594630_006.jpg">
46
+ <div class="t_attach" id="aimg_1254048_menu" style="position: absolute; display: none">
47
+ <a href="attachment.php?aid=MTI1NDA0OHxkYmZjODBmY3wxMzMyOTIzMTI4fGM0OTVjaUtKdjEveHl3OW1XSUFScll0MGtwWXFHRlNJUDV4S2ppbFMwU0p5TGNB&amp;nothumb=yes" title="55105222201110032253301023057594630_006.jpg" target="_blank"><strong>下载</strong></a> (247.14 KB)<br>
48
+ <div class="t_smallfont">2011-12-28 22:41</div>
49
+ </div>
50
+ <br>
51
+ <br>
52
+ <span style="position: absolute; display: none" id="attach_1254049" onmouseover="showMenu({'ctrlid':this.id,'pos':'13'})"><img src="images/go2eu/attachimg.gif" border="0"></span>
53
+ <img src="http://att.qyer.com/day_111228/1112282241a7c6f2711722bc9b.jpg" file="http://att.qyer.com/day_111228/1112282241a7c6f2711722bc9b.jpg" width="700" class="zoom" onclick="zoom(this, this.src)" id="aimg_1254049" onmouseover="showMenu({'ctrlid':this.id,'pos':'12'})" alt="55105222201110032253301023057594630_007.jpg">
54
+ <div class="t_attach" id="aimg_1254049_menu" style="position: absolute; z-index: 301; opacity: 1; left: 309px; top: 736px; display: none; ">
55
+ <a href="attachment.php?aid=MTI1NDA0OXxhYTE5MTBiMHwxMzMyOTIzMTI4fGM0OTVjaUtKdjEveHl3OW1XSUFScll0MGtwWXFHRlNJUDV4S2ppbFMwU0p5TGNB&amp;nothumb=yes" title="55105222201110032253301023057594630_007.jpg" target="_blank"><strong>下载</strong></a> (241.74 KB)<br>
56
+ <div class="t_smallfont">2011-12-28 22:41</div>
57
+ </div>
58
+ <br>
59
+ <font face="微软雅黑 "><font size="3">我们一行四人,第二个是我,很可爱吧~</font></font><br>
60
+ <br>
61
+ [<i> 本帖最后由 theme_of_ryan 于 2011-10-16 00:31 编辑 </i>]</td></tr></tbody></table>
62
+ CON
63
+ page = HTMLPage.new :contents => contents
64
+ page.img do |node,contents|
65
+ if node['src'].end_with? 'gif'
66
+ ''
67
+ elsif node['src'].start_with? 'http'
68
+ "![#{node['alt']}](#{node['src']} =300x)"
69
+ else
70
+ "![#{node['alt']}](http://bbs.qyer.com/#{node['src']} =300x)"
71
+ end
72
+ end
73
+ markdown = page.to_markdown page.contents
74
+ markdown.length.should > 0
75
+ puts markdown
76
+ end
77
+ end
@@ -0,0 +1,22 @@
1
# Shared bootstrap for the spec suite: set up Bundler when it is available,
# configure RSpec, then load the library under test.
begin
  require 'bundler/setup'
rescue LoadError
  puts 'Although not required, bundler is recommended for running the tests.'
end

RSpec.configure do |config|
  config.mock_with :rspec

  # `color_enabled=` no longer exists in RSpec 3, where the setting is `color=`.
  # Use whichever setter the installed RSpec provides.
  if config.respond_to?(:color=)
    config.color = true
  else
    config.color_enabled = true
  end
end

require 'html2markdown'
metadata ADDED
@@ -0,0 +1,70 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: html2markdown
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - mike lee
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-03-28 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: &2155274260 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *2155274260
25
+ description: simple and flexible html to markdown converter
26
+ email:
27
+ - mike.d.1984@gmail.com
28
+ executables: []
29
+ extensions: []
30
+ extra_rdoc_files: []
31
+ files:
32
+ - .gitignore
33
+ - Gemfile
34
+ - Rakefile
35
+ - Readme.md
36
+ - html2markdown.gemspec
37
+ - lib/html2markdown.rb
38
+ - lib/html2markdown/converter.rb
39
+ - lib/html2markdown/exceptions.rb
40
+ - lib/html2markdown/html_page.rb
41
+ - lib/html2markdown/version.rb
42
+ - spec/cases/converter_spec.rb
43
+ - spec/spec_helper.rb
44
+ homepage: ''
45
+ licenses: []
46
+ post_install_message:
47
+ rdoc_options: []
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ! '>='
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ requirements: []
63
+ rubyforge_project:
64
+ rubygems_version: 1.8.17
65
+ signing_key:
66
+ specification_version: 3
67
+ summary: simple and flexible html to markdown converter
68
+ test_files:
69
+ - spec/cases/converter_spec.rb
70
+ - spec/spec_helper.rb