html2markdown 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/Gemfile +14 -0
- data/Rakefile +2 -0
- data/Readme.md +3 -0
- data/html2markdown.gemspec +19 -0
- data/lib/html2markdown.rb +7 -0
- data/lib/html2markdown/converter.rb +83 -0
- data/lib/html2markdown/exceptions.rb +3 -0
- data/lib/html2markdown/html_page.rb +18 -0
- data/lib/html2markdown/version.rb +3 -0
- data/spec/cases/converter_spec.rb +77 -0
- data/spec/spec_helper.rb +22 -0
- metadata +70 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Rakefile
ADDED
data/Readme.md
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/html2markdown/version', __FILE__)
|
3
|
+
|
4
|
+
# Gem specification for html2markdown.
# The file, executable and test-file lists are read from `git ls-files`
# at the moment the spec is evaluated.
Gem::Specification.new do |spec|
  tagline = "simple and flexible html to markdown converter"
  # Runs a shell command and returns its output as an array of lines.
  list_tracked = lambda { |command| `#{command}`.split("\n") }

  spec.authors     = ["mike lee"]
  spec.email       = ["mike.d.1984@gmail.com"]
  spec.description = tagline
  spec.summary     = tagline
  spec.homepage    = ""

  spec.add_dependency 'nokogiri'

  spec.executables   = list_tracked.call('git ls-files -- bin/*').map { |path| File.basename(path) }
  spec.files         = list_tracked.call('git ls-files')
  spec.test_files    = list_tracked.call('git ls-files -- {test,spec,features}/*')
  spec.name          = "html2markdown"
  spec.require_paths = ["lib"]
  spec.version       = HTML2Markdown::VERSION
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require_relative 'exceptions'
|
3
|
+
|
4
|
+
module HTML2Markdown
  # Mixin that turns an HTML string into Markdown by walking the tree that
  # Nokogiri parses from it.
  #
  # The output for a tag can be overridden by calling a method named after the
  # tag with a block, e.g. `page.img { |node, contents| ... }` (see
  # #method_missing); #wrap_node then calls the generated `parse_<tag>` method
  # instead of its built-in rule.
  module Converter

    # Convert an HTML string to Markdown.
    #
    # @param string_contents [String] HTML source
    # @return [String] the Markdown of every top-level node, concatenated
    # @raise [NoContents] when string_contents is nil or not a String
    def to_markdown(string_contents)
      raise NoContents unless string_contents.is_a?(String)

      doc = Nokogiri::HTML(string_contents)
      doc.children.map { |ele| parse_element(ele) }.join
    end

    # Convert one parsed node. A text node yields its text plus a newline; any
    # other node is converted children-first and then wrapped by #wrap_node.
    #
    # @param ele [Nokogiri::XML::Node] text or element node
    # @return [String]
    def parse_element(ele)
      return "#{ele.text}\n" if ele.is_a?(Nokogiri::XML::Text)

      children = ele.children
      inner = if children.count > 0
                children.map { |child| parse_element(child) }.join
              else
                ele.text
              end
      wrap_node(ele, inner)
    end

    # Wrap already-converted inner content in the Markdown for +node+'s tag.
    #
    # Order of precedence: a custom `parse_<tag>` method if one is defined,
    # then the hidden-node check (inline style containing `display: none`
    # yields ''), then the built-in rules below. Unknown tags pass their
    # content through unchanged.
    #
    # @param node [#name, #[]] element being converted
    # @param contents [String, nil] converted inner content
    # @return [String]
    def wrap_node(node, contents = nil)
      # Strip a copy so the caller's string is never mutated.
      contents = contents.strip unless contents.nil?

      custom_parser = "parse_#{node.name}"
      return send(custom_parser, node, contents) if respond_to?(custom_parser)

      style = node['style']
      return '' if style && style =~ /display:\s*none/

      tag = node.name.downcase
      case tag
      when 'i'
        # NOTE(review): <i> content is dropped entirely, as before; confirm
        # whether italics should be emitted instead.
        ''
      when 'li'
        # NOTE(review): no space after the marker, as before.
        "*#{contents}\n"
      when 'blockquote'
        # Prefix every line of the quote; nil contents yields ''.
        contents.to_s.split("\n").map { |part| ">#{part}\n" }.join
      when 'strong'
        "**#{contents}**\n"
      when 'h1', 'h2', 'h3'
        level = tag[1].to_i
        "#{'#' * level}#{contents}\n"
      when 'hr'
        "****\n"
      when 'br'
        "\n"
      when 'img'
        "![#{node['alt']}](#{node['src']})"
      when 'a'
        "[#{contents}](#{node['href']})"
      else
        contents.to_s
      end
    end

    # Register a custom tag processor: `obj.<tag> { |node, contents| ... }`
    # defines `parse_<tag>` on the receiver's class, so it applies to every
    # instance of that class. Calls without a block are not registrations and
    # fall through to the normal NoMethodError.
    def method_missing(name, *args, &block)
      return super unless block

      self.class.send(:define_method, "parse_#{name}") do |node, contents|
        block.call(node, contents)
      end
    end

    # Print the block's value between two separator lines.
    def debug
      separator = '----------------------------------'
      puts separator
      puts yield
      puts separator
    end

  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require_relative 'converter'
|
2
|
+
require 'open-uri'
|
3
|
+
|
4
|
+
# An HTML page whose contents can be converted to Markdown through the
# HTML2Markdown::Converter mixin.
class HTMLPage
  include HTML2Markdown::Converter
  attr_accessor :url, :contents

  # @param options [Hash] recognised keys:
  #   :contents - HTML string to use as-is;
  #   :url      - location fetched when :contents is nil;
  #   :host     - stored in @host.
  # @param content_extrator [Proc] optional block; it is only stored here.
  def initialize(options, &content_extrator)
    @host = options[:host]
    @url = options[:url]
    @contents = options[:contents]
    @contents = fetch_contents(@url) if @contents.nil?
    @content_extrator = content_extrator
  end

  private

  # Download and parse +url+, returning the inner HTML of <body>, or of the
  # whole document when there is no <body> element.
  def fetch_contents(url)
    doc = open_url(url) { |io| Nokogiri::HTML(io) }
    # Pick the node first: calling inner_html on a nil <body> would raise
    # before any fallback could apply.
    (doc.at_css('body') || doc).inner_html
  end

  # Open +url+ with open-uri, yielding the stream so it is closed when the
  # block returns. URI.open is used where it exists (bare `open` no longer
  # fetches URLs on Ruby 3); older Rubies fall back to open-uri's Kernel#open.
  def open_url(url, &reader)
    if URI.respond_to?(:open)
      URI.open(url, &reader)
    else
      open(url, &reader)
    end
  end
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
# coding:utf-8
|
2
|
+
require 'spec_helper'
|
3
|
+
require_relative '../../lib/html2markdown'
|
4
|
+
|
5
|
+
# Specs for HTMLPage's Markdown conversion, using forum-post HTML fixtures.
describe HTMLPage do

  # Network-dependent example, kept disabled.
  # it "should have url and contents property" do
  #   url = 'http://bbs.qyer.com/viewthread.php?tid=503325&extra=page%3D1'
  #   page = HTMLPage.new :url => url
  #   page.contents.should_not be_nil
  #   page.url.should == url
  # end

  it "can convert to markdown format" do
    contents = <<-CON
\n<i class=\"pstatus\"> 本帖最后由 aria_lyy 于 2012-3-21 22:22 编辑 </i><br><br><strong>
<font color=\"Navy\"><font size=\"4\"><font size=\"5\">看贴前请先看这儿——如果您不喜欢旅游,
不喜欢自助旅游,不喜欢动点脑筋玩儿的舒坦点,不喜欢花点时间挑选旅行时自己可心儿的衣食住行景,
您就赶紧把这个页面关了吧,不然实在浪费您的时间和国家那所剩无几的电资源</font></font></font></strong>
<br><br><font color=\"Red\"><font size=\"5\">【前言】</font></font><br><br><strong>
<font size=\"4\"><font color=\"Navy\">【关于去旅游和不去旅游的理由】<br>\r\n1.如果你喜欢旅游尤其自助游,却有各种“不能去旅游的理由”请跳转至2、3、4、5、6、7!
<img src=\"images/smilies/default/lol.gif\" smilieid=\"12\" border=\"0\" alt=\"\"> 如果您没有那些讨厌的理由,请跳转至8!
<img src=\"images/smilies/default/loveliness.gif\" smilieid=\"28\" border=\"0\" alt=\"\"><br>\r\n2.如果你认为是钱阻挡了旅游的步伐?LZ也是个工薪阶层,也是每年辛苦工作,攒出旅游的钱;再者说,哪怕只走个国内游,也算开了眼界、见了世面,钱,不是问题,更何况有许多人坚持穷游概念,钱就更不是问题;<br>\r\n3.如果你认为是假期阻挡了旅游的步伐?LZ和内口子每年也只有7天年假,却仍能挤一挤,腾出10天左右的时间去趟国外逍遥一番;再者说,哪怕只是4天的海岛游,也算晒出个富豪脸~别说工作放不下,那是你根本不想去旅游;<br>\r\n4.如果你认为是语言阻挡了旅游的步伐?LZ2006年第一次去意大利、2009年第二次走意大利、走希腊、走法国,2010年去南非、马来西亚,2011年马耳他、意大利、马来西亚,再到今年这趟西班牙,从来!从来从来!没因为二把刀(也就徘徊在国家英语三到四级的边缘)不成句的英语迷路、饿肚子、丢东西丢人、误火车飞机大炮、买不成东西,关键我们去的很多都是非英语国家,经常会出现我们用英语跟别人交流,别人用叽里呱啦的母语跟我们交谈,然后解决一件特别难的大事儿~~GOOD!东特哇瑞,别担心沟通有问题,那不是旅行的重点!<br>\r\n5.没伙伴?不旅行?哇塞,你一定是孤独症患者;<br>\r\n6.爹妈不让?不旅行?哇塞,还是去街心花园吧;<br>\r\n7.懒得查这个那个?你如果愿意点开这个帖子,就说明你根本不懒~</font></font></strong><br><strong><font size=\"5\"><font color=\"DarkRed\">8.那就揣着护照,动身吧!!!!!</font></font></strong>\n
    CON
    page = HTMLPage.new :contents => contents
    markdown = page.to_markdown page.contents
    markdown.length.should > 0
  end

  it "can accept custom parse" do
    contents = <<-CON
<table cellspacing="0" cellpadding="0"><tbody><tr><td class="t_msgfont" id="postmessage_4983254"><i class="pstatus"> 本帖最后由 theme_of_ryan 于 2011-12-29 22:21 编辑 </i><br>
<br>
<font size="4"><font face="微软雅黑 "><font size="3"><font face="微软雅黑 ">今年7月和几个<span href="tag.php?name=%E6%9C%8B%E5%8F%8B" onclick="tagshow(event)" class="t_tag">朋友</span>在吃饭漫无边际聊天的时候,突然提出今年12月去<span href="tag.php?name=%E6%B3%95%E5%9B%BD" onclick="tagshow(event)" class="t_tag">法国</span>和西班牙深度游的想法,当晚查询<span href="tag.php?name=%E6%9C%BA%E7%A5%A8" onclick="tagshow(event)" class="t_tag">机票</span>刚好遇到特价机票,毫不犹豫果断出票。去年我和男朋友自助游去过<span href="tag.php?name=%E6%84%8F%E5%A4%A7%E5%88%A9" onclick="tagshow(event)" class="t_tag">意大利</span>、克罗地亚和希腊22天(<strong><a href="http://bbs.go2eu.com/viewthread.php?tid=494274" target="_blank"><strong><font color="#0066cc">游记点这里</font></strong></a></strong>),所以做攻略的任务便落在了我身上。于是……我慢悠悠的开始准备阶段,现在接近完成阶段即开帖和大家分享当中的乐趣,把我最感兴趣、最关注的关键点写在帖子里,偶尔还需要和大家请教过程中遇到的问题。</font><font face="微软雅黑 "></font> <br>
<font face="微软雅黑 "><strong><font color="#ff00"></font></strong></font></font></font></font><br>
<font size="4"><font face="微软雅黑 "><font size="3"><font face="微软雅黑 "><strong><font color="#ff00">内容预告(呕心沥血的超级攻略放送ing):</font></strong></font><br>
<font color="#ff00"><font face="微软雅黑 ">NOW!全程旅馆预订tips更新中<br>
NEXT!<br>
1、收集的参考资讯的连接tips<br>
2、全程涉及交通工具的预订方式(网站、班次和价格)<br>
</font></font><font color="#ff00"><font face="微软雅黑 ">3、全程目的地地图、游玩路线、标准景点介绍、独特景点推荐、餐厅推荐、Shopping推荐、最佳摄影点及注意事项</font></font></font></font></font><br>
<br>
<font face="微软雅黑 "><font size="3"><strong>DIY的手绘行程图</strong></font></font><br>
<span style="position: absolute; display: none" id="attach_1254048" onmouseover="showMenu({'ctrlid':this.id,'pos':'13'})"><img src="images/go2eu/attachimg.gif" border="0"></span>
<img src="http://att.qyer.com/day_111228/1112282241c0deb3a69dcd9cf5.jpg" file="http://att.qyer.com/day_111228/1112282241c0deb3a69dcd9cf5.jpg" width="700" class="zoom" onclick="zoom(this, this.src)" id="aimg_1254048" onmouseover="showMenu({'ctrlid':this.id,'pos':'12'})" alt="55105222201110032253301023057594630_006.jpg">
<div class="t_attach" id="aimg_1254048_menu" style="position: absolute; display: none">
<a href="attachment.php?aid=MTI1NDA0OHxkYmZjODBmY3wxMzMyOTIzMTI4fGM0OTVjaUtKdjEveHl3OW1XSUFScll0MGtwWXFHRlNJUDV4S2ppbFMwU0p5TGNB&nothumb=yes" title="55105222201110032253301023057594630_006.jpg" target="_blank"><strong>下载</strong></a> (247.14 KB)<br>
<div class="t_smallfont">2011-12-28 22:41</div>
</div>
<br>
<br>
<span style="position: absolute; display: none" id="attach_1254049" onmouseover="showMenu({'ctrlid':this.id,'pos':'13'})"><img src="images/go2eu/attachimg.gif" border="0"></span>
<img src="http://att.qyer.com/day_111228/1112282241a7c6f2711722bc9b.jpg" file="http://att.qyer.com/day_111228/1112282241a7c6f2711722bc9b.jpg" width="700" class="zoom" onclick="zoom(this, this.src)" id="aimg_1254049" onmouseover="showMenu({'ctrlid':this.id,'pos':'12'})" alt="55105222201110032253301023057594630_007.jpg">
<div class="t_attach" id="aimg_1254049_menu" style="position: absolute; z-index: 301; opacity: 1; left: 309px; top: 736px; display: none; ">
<a href="attachment.php?aid=MTI1NDA0OXxhYTE5MTBiMHwxMzMyOTIzMTI4fGM0OTVjaUtKdjEveHl3OW1XSUFScll0MGtwWXFHRlNJUDV4S2ppbFMwU0p5TGNB&nothumb=yes" title="55105222201110032253301023057594630_007.jpg" target="_blank"><strong>下载</strong></a> (241.74 KB)<br>
<div class="t_smallfont">2011-12-28 22:41</div>
</div>
<br>
<font face="微软雅黑 "><font size="3">我们一行四人,第二个是我,很可爱吧~</font></font><br>
<br>
[<i> 本帖最后由 theme_of_ryan 于 2011-10-16 00:31 编辑 </i>]</td></tr></tbody></table>
    CON
    page = HTMLPage.new :contents => contents
    # Custom <img> rule: drop gifs, size-limit everything else, and make
    # relative sources absolute.
    page.img do |node,contents|
      if node['src'].end_with? 'gif'
        ''
      elsif node['src'].start_with? 'http'
        "![#{node['alt']}](#{node['src']} =300x)"
      else
        "![#{node['alt']}](http://bbs.qyer.com/#{node['src']} =300x)"
      end
    end
    markdown = page.to_markdown page.contents
    markdown.length.should > 0
    # Check that the custom rule was actually applied: the jpg images carry
    # the size suffix and the gif placeholders are gone.
    markdown.should include('=300x)')
    markdown.should_not include('attachimg.gif')
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# Load Bundler's environment when it is installed; without it a notice is
# printed and loading continues.
begin
  require 'bundler/setup'
rescue LoadError
  puts 'Although not required, bundler is recommended for running the tests.'
end

# RSpec setup: built-in mocks, coloured output, and empty placeholder hooks.
RSpec.configure do |rspec|
  rspec.mock_with :rspec
  rspec.color_enabled = true

  rspec.before(:each) { }

  rspec.before(:each, :type => :controller) { }

  rspec.after(:all) { }
end

require 'html2markdown'
|
metadata
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: html2markdown
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- mike lee
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-03-28 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: &2155274260 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *2155274260
|
25
|
+
description: simple and flexible html to markdown converter
|
26
|
+
email:
|
27
|
+
- mike.d.1984@gmail.com
|
28
|
+
executables: []
|
29
|
+
extensions: []
|
30
|
+
extra_rdoc_files: []
|
31
|
+
files:
|
32
|
+
- .gitignore
|
33
|
+
- Gemfile
|
34
|
+
- Rakefile
|
35
|
+
- Readme.md
|
36
|
+
- html2markdown.gemspec
|
37
|
+
- lib/html2markdown.rb
|
38
|
+
- lib/html2markdown/converter.rb
|
39
|
+
- lib/html2markdown/exceptions.rb
|
40
|
+
- lib/html2markdown/html_page.rb
|
41
|
+
- lib/html2markdown/version.rb
|
42
|
+
- spec/cases/converter_spec.rb
|
43
|
+
- spec/spec_helper.rb
|
44
|
+
homepage: ''
|
45
|
+
licenses: []
|
46
|
+
post_install_message:
|
47
|
+
rdoc_options: []
|
48
|
+
require_paths:
|
49
|
+
- lib
|
50
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
52
|
+
requirements:
|
53
|
+
- - ! '>='
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '0'
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
requirements: []
|
63
|
+
rubyforge_project:
|
64
|
+
rubygems_version: 1.8.17
|
65
|
+
signing_key:
|
66
|
+
specification_version: 3
|
67
|
+
summary: simple and flexible html to markdown converter
|
68
|
+
test_files:
|
69
|
+
- spec/cases/converter_spec.rb
|
70
|
+
- spec/spec_helper.rb
|