html2markdown 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,14 @@
1
+ source 'http://rubygems.org'
2
+
3
+ group :development, :test do
4
+ gem 'rspec'
5
+ # Testing infrastructure
6
+ gem 'guard'
7
+ gem 'guard-rspec'
8
+
9
+ end
10
+
11
+ gem 'nokogiri'
12
+
13
+ # Specify your gem's dependencies in html2markdown.gemspec
14
+ gemspec
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
data/Readme.md ADDED
@@ -0,0 +1,3 @@
1
+ ### Yet Another HTML to Markdown Ruby library
2
+ We love Markdown because it is friendly to edit,
3
+ so we want everything to be Markdown.
@@ -0,0 +1,19 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/html2markdown/version', __FILE__)
3
+
4
+ Gem::Specification.new do |gem|
5
+ gem.authors = ["mike lee"]
6
+ gem.email = ["mike.d.1984@gmail.com"]
7
+ gem.description = %q{simple and flexible html to markdown converter}
8
+ gem.summary = %q{simple and flexible html to markdown converter}
9
+ gem.homepage = ""
10
+
11
+ gem.add_dependency 'nokogiri'
12
+
13
+ gem.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
14
+ gem.files = `git ls-files`.split("\n")
15
+ gem.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
16
+ gem.name = "html2markdown"
17
+ gem.require_paths = ["lib"]
18
+ gem.version = HTML2Markdown::VERSION
19
+ end
@@ -0,0 +1,7 @@
1
+ require "html2markdown/version"
2
+ require_relative 'html2markdown/converter'
3
+ require_relative 'html2markdown/html_page'
4
+
5
+ module HTML2Markdown
6
+ # Your code goes here...
7
+ end
@@ -0,0 +1,83 @@
1
+ require 'nokogiri'
2
+ require_relative 'exceptions'
3
+
4
+ module HTML2Markdown
5
+ module Converter
6
+
7
+ def to_markdown string_contents
8
+ raise NoContents unless string_contents!=nil and string_contents.is_a?(String)
9
+ doc = Nokogiri::HTML(string_contents)
10
+ doc.children.map { |ele| parse_element(ele) }.join
11
+ end
12
+
13
+ # Convert a single parsed element to markdown.
14
+ # The element may be a plain text node,
15
+ # or a regular node (with or without children).
16
+ def parse_element(ele)
17
+ if ele.is_a? Nokogiri::XML::Text
18
+ return "#{ele.text}\n"
19
+ else
20
+ if (children = ele.children).count > 0
21
+ return wrap_node(ele,children.map {|ele| parse_element(ele)}.join )
22
+ else
23
+ return wrap_node(ele,ele.text)
24
+ end
25
+ end
26
+ end
27
+
28
+ # wrap node with markdown
29
+ def wrap_node(node,contents=nil)
30
+ result = ''
31
+ contents.strip! unless contents==nil
32
+ # check whether a custom parser exists for this node
33
+ if respond_to? "parse_#{node.name}"
34
+ return self.send("parse_#{node.name}",node,contents)
35
+ end
36
+ # skip hidden node
37
+ return '' if node['style'] and node['style'] =~ /display:\s*none/
38
+ # default parse
39
+ case node.name.downcase
40
+ when 'i'
41
+ when 'li'
42
+ result << "*#{contents}\n"
43
+ when 'blockquote'
44
+ contents.split('\n').each do |part|
45
+ result << ">#{contents}\n"
46
+ end
47
+ when 'strong'
48
+ result << "**#{contents}**\n"
49
+ when 'h1'
50
+ result << "##{contents}\n"
51
+ when 'h2'
52
+ result << "###{contents}\n"
53
+ when 'h3'
54
+ result << "####{contents}\n"
55
+ when 'hr'
56
+ result << "****\n"
57
+ when 'br'
58
+ result << "\n"
59
+ when 'img'
60
+ result << "![#{node['alt']}](#{node['src']})"
61
+ when 'a'
62
+ result << "[#{contents}](#{node['href']})"
63
+ else
64
+ result << contents unless contents == nil
65
+ end
66
+ result
67
+ end
68
+
69
+ # define custom node processor
70
+ def method_missing(name,*args,&block)
71
+ self.class.send :define_method,"parse_#{name}" do |node,contents|
72
+ block.call node,contents
73
+ end
74
+ end
75
+
76
+ def debug
77
+ puts '----------------------------------'
78
+ puts yield
79
+ puts '----------------------------------'
80
+ end
81
+
82
+ end
83
+ end
@@ -0,0 +1,3 @@
1
+ module HTML2Markdown
2
+ class NoContents < Exception;end
3
+ end
@@ -0,0 +1,18 @@
1
+ require_relative 'converter'
2
+ require 'open-uri'
3
+
4
+ class HTMLPage
5
+ include HTML2Markdown::Converter
6
+ attr_accessor :url,:contents
7
+
8
+ def initialize(options,&content_extrator)
9
+ @host = options[:host]
10
+ @url = options[:url]
11
+ if (@contents = options[:contents]) == nil
12
+ doc = Nokogiri::HTML(open(@url))
13
+ @contents = doc.at_css('body').send(:inner_html) || doc.inner_html
14
+ end
15
+ @content_extrator = content_extrator
16
+ end
17
+
18
+ end
@@ -0,0 +1,3 @@
1
+ module HTML2Markdown
2
+ VERSION = "0.0.1"
3
+ end
@@ -0,0 +1,77 @@
1
+ # coding:utf-8
2
+ require 'spec_helper'
3
+ require_relative '../../lib/html2markdown'
4
+
5
+ describe HTMLPage do
6
+
7
+ # it "should have url and contents property" do
8
+ # url = 'http://bbs.qyer.com/viewthread.php?tid=503325&extra=page%3D1'
9
+ # page = HTMLPage.new :url => url
10
+ # page.contents.should_not be_nil
11
+ # page.url.should == url
12
+ # end
13
+
14
+ it "can convert to markdow format" do
15
+ contents = <<-CON
16
+ \n<i class=\"pstatus\"> 本帖最后由 aria_lyy 于 2012-3-21 22:22 编辑 </i><br><br><strong>
17
+ <font color=\"Navy\"><font size=\"4\"><font size=\"5\">看贴前请先看这儿——如果您不喜欢旅游,
18
+ 不喜欢自助旅游,不喜欢动点脑筋玩儿的舒坦点,不喜欢花点时间挑选旅行时自己可心儿的衣食住行景,
19
+ 您就赶紧把这个页面关了吧,不然实在浪费您的时间和国家那所剩无几的电资源</font></font></font></strong>
20
+ <br><br><font color=\"Red\"><font size=\"5\">【前言】</font></font><br><br><strong>
21
+ <font size=\"4\"><font color=\"Navy\">【关于去旅游和不去旅游的理由】<br>\r\n1.如果你喜欢旅游尤其自助游,却有各种“不能去旅游的理由”请跳转至2、3、4、5、6、7!
22
+ <img src=\"images/smilies/default/lol.gif\" smilieid=\"12\" border=\"0\" alt=\"\"> 如果您没有那些讨厌的理由,请跳转至8!
23
+ <img src=\"images/smilies/default/loveliness.gif\" smilieid=\"28\" border=\"0\" alt=\"\"><br>\r\n2.如果你认为是钱阻挡了旅游的步伐?LZ也是个工薪阶层,也是每年辛苦工作,攒出旅游的钱;再者说,哪怕只走个国内游,也算开了眼界、见了世面,钱,不是问题,更何况有许多人坚持穷游概念,钱就更不是问题;<br>\r\n3.如果你认为是假期阻挡了旅游的步伐?LZ和内口子每年也只有7天年假,却仍能挤一挤,腾出10天左右的时间去趟国外逍遥一番;再者说,哪怕只是4天的海岛游,也算晒出个富豪脸~别说工作放不下,那是你根本不想去旅游;<br>\r\n4.如果你认为是语言阻挡了旅游的步伐?LZ2006年第一次去意大利、2009年第二次走意大利、走希腊、走法国,2010年去南非、马来西亚,2011年马耳他、意大利、马来西亚,再到今年这趟西班牙,从来!从来从来!没因为二把刀(也就徘徊在国家英语三到四级的边缘)不成句的英语迷路、饿肚子、丢东西丢人、误火车飞机大炮、买不成东西,关键我们去的很多都是非英语国家,经常会出现我们用英语跟别人交流,别人用叽里呱啦的母语跟我们交谈,然后解决一件特别难的大事儿~~GOOD!东特哇瑞,别担心沟通有问题,那不是旅行的重点!<br>\r\n5.没伙伴?不旅行?哇塞,你一定是孤独症患者;<br>\r\n6.爹妈不让?不旅行?哇塞,还是去街心花园吧;<br>\r\n7.懒得查这个那个?你如果愿意点开这个帖子,就说明你根本不懒~</font></font></strong><br><strong><font size=\"5\"><font color=\"DarkRed\">8.那就揣着护照,动身吧!!!!!</font></font></strong>\n
24
+ CON
25
+ page = HTMLPage.new :contents => contents
26
+ markdown = page.to_markdown page.contents
27
+ markdown.length.should > 0
28
+ end
29
+
30
+ it "can accept custom parse" do
31
+ contents = <<-CON
32
+ <table cellspacing="0" cellpadding="0"><tbody><tr><td class="t_msgfont" id="postmessage_4983254"><i class="pstatus"> 本帖最后由 theme_of_ryan 于 2011-12-29 22:21 编辑 </i><br>
33
+ <br>
34
+ <font size="4"><font face="微软雅黑 "><font size="3"><font face="微软雅黑 ">今年7月和几个<span href="tag.php?name=%E6%9C%8B%E5%8F%8B" onclick="tagshow(event)" class="t_tag">朋友</span>在吃饭漫无边际聊天的时候,突然提出今年12月去<span href="tag.php?name=%E6%B3%95%E5%9B%BD" onclick="tagshow(event)" class="t_tag">法国</span>和西班牙深度游的想法,当晚查询<span href="tag.php?name=%E6%9C%BA%E7%A5%A8" onclick="tagshow(event)" class="t_tag">机票</span>刚好遇到特价机票,毫不犹豫果断出票。去年我和男朋友自助游去过<span href="tag.php?name=%E6%84%8F%E5%A4%A7%E5%88%A9" onclick="tagshow(event)" class="t_tag">意大利</span>、克罗地亚和希腊22天(<strong><a href="http://bbs.go2eu.com/viewthread.php?tid=494274" target="_blank"><strong><font color="#0066cc">游记点这里</font></strong></a></strong>),所以做攻略的任务便落在了我身上。于是……我慢悠悠的开始准备阶段,现在接近完成阶段即开帖和大家分享当中的乐趣,把我最感兴趣、最关注的关键点写在帖子里,偶尔还需要和大家请教过程中遇到的问题。</font><font face="微软雅黑 "></font> <br>
35
+ <font face="微软雅黑 "><strong><font color="#ff00"></font></strong></font></font></font></font><br>
36
+ <font size="4"><font face="微软雅黑 "><font size="3"><font face="微软雅黑 "><strong><font color="#ff00">内容预告(呕心沥血的超级攻略放送ing):</font></strong></font><br>
37
+ <font color="#ff00"><font face="微软雅黑 ">NOW!全程旅馆预订tips更新中<br>
38
+ NEXT!<br>
39
+ 1、收集的参考资讯的连接tips<br>
40
+ 2、全程涉及交通工具的预订方式(网站、班次和价格)<br>
41
+ </font></font><font color="#ff00"><font face="微软雅黑 ">3、全程目的地地图、游玩路线、标准景点介绍、独特景点推荐、餐厅推荐、Shopping推荐、最佳摄影点及注意事项</font></font></font></font></font><br>
42
+ <br>
43
+ <font face="微软雅黑 "><font size="3"><strong>DIY的手绘行程图</strong></font></font><br>
44
+ <span style="position: absolute; display: none" id="attach_1254048" onmouseover="showMenu({'ctrlid':this.id,'pos':'13'})"><img src="images/go2eu/attachimg.gif" border="0"></span>
45
+ <img src="http://att.qyer.com/day_111228/1112282241c0deb3a69dcd9cf5.jpg" file="http://att.qyer.com/day_111228/1112282241c0deb3a69dcd9cf5.jpg" width="700" class="zoom" onclick="zoom(this, this.src)" id="aimg_1254048" onmouseover="showMenu({'ctrlid':this.id,'pos':'12'})" alt="55105222201110032253301023057594630_006.jpg">
46
+ <div class="t_attach" id="aimg_1254048_menu" style="position: absolute; display: none">
47
+ <a href="attachment.php?aid=MTI1NDA0OHxkYmZjODBmY3wxMzMyOTIzMTI4fGM0OTVjaUtKdjEveHl3OW1XSUFScll0MGtwWXFHRlNJUDV4S2ppbFMwU0p5TGNB&amp;nothumb=yes" title="55105222201110032253301023057594630_006.jpg" target="_blank"><strong>下载</strong></a> (247.14 KB)<br>
48
+ <div class="t_smallfont">2011-12-28 22:41</div>
49
+ </div>
50
+ <br>
51
+ <br>
52
+ <span style="position: absolute; display: none" id="attach_1254049" onmouseover="showMenu({'ctrlid':this.id,'pos':'13'})"><img src="images/go2eu/attachimg.gif" border="0"></span>
53
+ <img src="http://att.qyer.com/day_111228/1112282241a7c6f2711722bc9b.jpg" file="http://att.qyer.com/day_111228/1112282241a7c6f2711722bc9b.jpg" width="700" class="zoom" onclick="zoom(this, this.src)" id="aimg_1254049" onmouseover="showMenu({'ctrlid':this.id,'pos':'12'})" alt="55105222201110032253301023057594630_007.jpg">
54
+ <div class="t_attach" id="aimg_1254049_menu" style="position: absolute; z-index: 301; opacity: 1; left: 309px; top: 736px; display: none; ">
55
+ <a href="attachment.php?aid=MTI1NDA0OXxhYTE5MTBiMHwxMzMyOTIzMTI4fGM0OTVjaUtKdjEveHl3OW1XSUFScll0MGtwWXFHRlNJUDV4S2ppbFMwU0p5TGNB&amp;nothumb=yes" title="55105222201110032253301023057594630_007.jpg" target="_blank"><strong>下载</strong></a> (241.74 KB)<br>
56
+ <div class="t_smallfont">2011-12-28 22:41</div>
57
+ </div>
58
+ <br>
59
+ <font face="微软雅黑 "><font size="3">我们一行四人,第二个是我,很可爱吧~</font></font><br>
60
+ <br>
61
+ [<i> 本帖最后由 theme_of_ryan 于 2011-10-16 00:31 编辑 </i>]</td></tr></tbody></table>
62
+ CON
63
+ page = HTMLPage.new :contents => contents
64
+ page.img do |node,contents|
65
+ if node['src'].end_with? 'gif'
66
+ ''
67
+ elsif node['src'].start_with? 'http'
68
+ "![#{node['alt']}](#{node['src']} =300x)"
69
+ else
70
+ "![#{node['alt']}](http://bbs.qyer.com/#{node['src']} =300x)"
71
+ end
72
+ end
73
+ markdown = page.to_markdown page.contents
74
+ markdown.length.should > 0
75
+ puts markdown
76
+ end
77
+ end
@@ -0,0 +1,22 @@
1
+ begin
2
+ require 'bundler/setup'
3
+ rescue LoadError
4
+ puts 'Although not required, bundler is recommended for running the tests.'
5
+ end
6
+
7
+ RSpec.configure do |config|
8
+ config.mock_with :rspec
9
+ config.color_enabled = true
10
+
11
+ config.before(:each) do
12
+
13
+ end
14
+
15
+ config.before(:each, :type => :controller) do
16
+ end
17
+
18
+ config.after(:all) do
19
+ end
20
+ end
21
+
22
+ require 'html2markdown'
metadata ADDED
@@ -0,0 +1,70 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: html2markdown
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - mike lee
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-03-28 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: &2155274260 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *2155274260
25
+ description: simple and flexible html to markdown converter
26
+ email:
27
+ - mike.d.1984@gmail.com
28
+ executables: []
29
+ extensions: []
30
+ extra_rdoc_files: []
31
+ files:
32
+ - .gitignore
33
+ - Gemfile
34
+ - Rakefile
35
+ - Readme.md
36
+ - html2markdown.gemspec
37
+ - lib/html2markdown.rb
38
+ - lib/html2markdown/converter.rb
39
+ - lib/html2markdown/exceptions.rb
40
+ - lib/html2markdown/html_page.rb
41
+ - lib/html2markdown/version.rb
42
+ - spec/cases/converter_spec.rb
43
+ - spec/spec_helper.rb
44
+ homepage: ''
45
+ licenses: []
46
+ post_install_message:
47
+ rdoc_options: []
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ! '>='
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ requirements: []
63
+ rubyforge_project:
64
+ rubygems_version: 1.8.17
65
+ signing_key:
66
+ specification_version: 3
67
+ summary: simple and flexible html to markdown converter
68
+ test_files:
69
+ - spec/cases/converter_spec.rb
70
+ - spec/spec_helper.rb