markitdown 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +1 -1
- data/lib/markitdown.rb +13 -7
- data/lib/markitdown/version.rb +1 -1
- data/markitdown.gemspec +1 -0
- data/spec/code.html +38 -0
- data/spec/code_spec.rb +25 -0
- data/spec/code_with_language.markdown +40 -0
- data/spec/code_without_language.markdown +40 -0
- data/spec/doc_spec.rb +1 -0
- data/spec/nesting_spec.rb +1 -0
- data/spec/spec_helper.rb +15 -0
- data/spec/table_spec.rb +1 -0
- data/spec/tag_spec.rb +2 -1
- metadata +30 -4
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Markitdown [![Build Status](https://secure.travis-ci.org/cpetersen/markitdown.png)](http://travis-ci.org/cpetersen/markitdown)
|
1
|
+
# Markitdown [![Build Status](https://secure.travis-ci.org/cpetersen/markitdown.png)](http://travis-ci.org/cpetersen/markitdown) [![Coverage Status](https://coveralls.io/repos/cpetersen/markitdown/badge.png?branch=master)](https://coveralls.io/r/cpetersen/markitdown?branch=master)
|
2
2
|
|
3
3
|
Markitdown is a Ruby library that converts HTML to Markdown. It's powered by Nokogiri. It supports:
|
4
4
|
|
data/lib/markitdown.rb
CHANGED
@@ -4,20 +4,20 @@ require "markitdown/version"
|
|
4
4
|
require "nokogiri"
|
5
5
|
|
6
6
|
module Markitdown
|
7
|
-
def self.from_html(html)
|
8
|
-
from_nokogiri(Nokogiri::XML(html).root)
|
7
|
+
def self.from_html(html, language_classifier=nil)
|
8
|
+
from_nokogiri(Nokogiri::XML(html).root, language_classifier)
|
9
9
|
end
|
10
10
|
|
11
|
-
def self.from_nokogiri(node)
|
11
|
+
def self.from_nokogiri(node, language_classifier=nil)
|
12
12
|
# gsub(/\n\s+\n/,"\n\n") - remove lines with nothing but space characters
|
13
13
|
# gsub(/\n{2,}/,"\n\n") - collapse any series of more than 2 new lines down to 2
|
14
14
|
# gsub(/\t+/," ") - collapse consecutive tabs down to a single space. I use tabs to pad divs and span, this causes multiple nested spans and divs to ultimately be surrounded by a single space.
|
15
15
|
# gsub(/ ([\.\?])/,'\1') - removes a space before a period or question mark. Things like links get surrounded by spaces. If they appear at the end of a sentence, this makes sure the punctuation isn't off.
|
16
|
-
self.parse_node(node).flatten.compact.join.gsub(/\n\s+\n/,"\n\n").gsub(/\n{2,}/,"\n\n").gsub(/( > \n){2,}/,"\n > \n > ").gsub(/\t+/," ").gsub(/ ([\.\?])/,'\1').gsub(/\s*END_TAG\((.{1,3})\)/, "\\1")
|
16
|
+
self.parse_node(node, [], language_classifier).flatten.compact.join.gsub(/\n\s+\n/,"\n\n").gsub(/\n{2,}/,"\n\n").gsub(/( > \n){2,}/,"\n > \n > ").gsub(/\t+/," ").gsub(/ ([\.\?])/,'\1').gsub(/\s*END_TAG\((.{1,3})\)/, "\\1")
|
17
17
|
end
|
18
18
|
|
19
19
|
private
|
20
|
-
def self.parse_node(node, states=[])
|
20
|
+
def self.parse_node(node, states=[], language_classifier=nil)
|
21
21
|
results=[]
|
22
22
|
after = nil
|
23
23
|
states.unshift node.name.downcase
|
@@ -141,7 +141,13 @@ module Markitdown
|
|
141
141
|
results << node.text.strip.gsub("\n","").gsub(/ {2,}/," ")
|
142
142
|
when "code"
|
143
143
|
if node.text.include?("\n")
|
144
|
-
|
144
|
+
text = node.text.gsub(/^\n/,"").gsub(/\n\s*$/,"")
|
145
|
+
if language_classifier
|
146
|
+
language = language_classifier.classify(text)
|
147
|
+
results << "\n\n```#{language}\n#{text}\n```\n\n"
|
148
|
+
else
|
149
|
+
results << "\n\n```\n#{text}\n```\n\n"
|
150
|
+
end
|
145
151
|
else
|
146
152
|
results << " `#{node.text}` "
|
147
153
|
end
|
@@ -171,7 +177,7 @@ module Markitdown
|
|
171
177
|
|
172
178
|
if recurse
|
173
179
|
node.children.each do |child|
|
174
|
-
contents = self.parse_node(child, states)
|
180
|
+
contents = self.parse_node(child, states, language_classifier)
|
175
181
|
contents = contents.flatten.compact.join.strip if strip_content
|
176
182
|
contents = [contents].flatten.compact.join.gsub("\n", " ") if flatten_content
|
177
183
|
results << contents
|
data/lib/markitdown/version.rb
CHANGED
data/markitdown.gemspec
CHANGED
@@ -11,6 +11,7 @@ Gem::Specification.new do |gem|
|
|
11
11
|
gem.add_dependency('nokogiri')
|
12
12
|
gem.add_development_dependency('rake')
|
13
13
|
gem.add_development_dependency('rspec')
|
14
|
+
gem.add_development_dependency('coveralls')
|
14
15
|
|
15
16
|
gem.files = `git ls-files`.split($\)
|
16
17
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
data/spec/code.html
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
<div>
|
2
|
+
This is an html block
|
3
|
+
<code>
|
4
|
+
<table>
|
5
|
+
<thead>
|
6
|
+
<tr>
|
7
|
+
<th>Column 1</th>
|
8
|
+
<th>Column 2</th>
|
9
|
+
<th>Column 3</th>
|
10
|
+
</tr>
|
11
|
+
</thead>
|
12
|
+
<tbody>
|
13
|
+
<tr>
|
14
|
+
<td>Value 1</td>
|
15
|
+
<td>Value 2</td>
|
16
|
+
<td>Value 3</td>
|
17
|
+
</tr>
|
18
|
+
<tr>
|
19
|
+
<td>Value 1a</td>
|
20
|
+
<td>Value 2a</td>
|
21
|
+
<td>Value 3a</td>
|
22
|
+
</tr>
|
23
|
+
</tbody>
|
24
|
+
</table>
|
25
|
+
</code>
|
26
|
+
|
27
|
+
This is a ruby block
|
28
|
+
<code>
|
29
|
+
# GET /blogs/1
|
30
|
+
# GET /blogs/1.json
|
31
|
+
def show
|
32
|
+
respond_to do |format|
|
33
|
+
format.html # show.html.erb
|
34
|
+
format.json { render json: @blog }
|
35
|
+
end
|
36
|
+
end
|
37
|
+
</code>
|
38
|
+
</div>
|
data/spec/code_spec.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'markitdown'
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
describe Markitdown do
|
5
|
+
context "When parsing codeblocks" do
|
6
|
+
let(:html) { File.read("spec/code.html") }
|
7
|
+
|
8
|
+
context "and not guessing the language" do
|
9
|
+
let(:markdown) { File.read("spec/code_without_language.markdown") }
|
10
|
+
|
11
|
+
it "should produce valid markdown" do
|
12
|
+
Markitdown.from_html(html).should == markdown
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
context "and guessing the language" do
|
17
|
+
let(:markdown) { File.read("spec/code_with_language.markdown") }
|
18
|
+
let(:classifier) { TestLanguageClassifier.new }
|
19
|
+
|
20
|
+
it "should produce valid markdown" do
|
21
|
+
Markitdown.from_html(html, classifier).should == markdown
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
This is an html block
|
2
|
+
|
3
|
+
```html
|
4
|
+
<table>
|
5
|
+
<thead>
|
6
|
+
<tr>
|
7
|
+
<th>Column 1</th>
|
8
|
+
<th>Column 2</th>
|
9
|
+
<th>Column 3</th>
|
10
|
+
</tr>
|
11
|
+
</thead>
|
12
|
+
<tbody>
|
13
|
+
<tr>
|
14
|
+
<td>Value 1</td>
|
15
|
+
<td>Value 2</td>
|
16
|
+
<td>Value 3</td>
|
17
|
+
</tr>
|
18
|
+
<tr>
|
19
|
+
<td>Value 1a</td>
|
20
|
+
<td>Value 2a</td>
|
21
|
+
<td>Value 3a</td>
|
22
|
+
</tr>
|
23
|
+
</tbody>
|
24
|
+
</table>
|
25
|
+
```
|
26
|
+
|
27
|
+
This is a ruby block
|
28
|
+
|
29
|
+
```ruby
|
30
|
+
# GET /blogs/1
|
31
|
+
# GET /blogs/1.json
|
32
|
+
def show
|
33
|
+
respond_to do |format|
|
34
|
+
format.html # show.html.erb
|
35
|
+
format.json { render json: @blog }
|
36
|
+
end
|
37
|
+
end
|
38
|
+
```
|
39
|
+
|
40
|
+
|
@@ -0,0 +1,40 @@
|
|
1
|
+
This is an html block
|
2
|
+
|
3
|
+
```
|
4
|
+
<table>
|
5
|
+
<thead>
|
6
|
+
<tr>
|
7
|
+
<th>Column 1</th>
|
8
|
+
<th>Column 2</th>
|
9
|
+
<th>Column 3</th>
|
10
|
+
</tr>
|
11
|
+
</thead>
|
12
|
+
<tbody>
|
13
|
+
<tr>
|
14
|
+
<td>Value 1</td>
|
15
|
+
<td>Value 2</td>
|
16
|
+
<td>Value 3</td>
|
17
|
+
</tr>
|
18
|
+
<tr>
|
19
|
+
<td>Value 1a</td>
|
20
|
+
<td>Value 2a</td>
|
21
|
+
<td>Value 3a</td>
|
22
|
+
</tr>
|
23
|
+
</tbody>
|
24
|
+
</table>
|
25
|
+
```
|
26
|
+
|
27
|
+
This is a ruby block
|
28
|
+
|
29
|
+
```
|
30
|
+
# GET /blogs/1
|
31
|
+
# GET /blogs/1.json
|
32
|
+
def show
|
33
|
+
respond_to do |format|
|
34
|
+
format.html # show.html.erb
|
35
|
+
format.json { render json: @blog }
|
36
|
+
end
|
37
|
+
end
|
38
|
+
```
|
39
|
+
|
40
|
+
|
data/spec/doc_spec.rb
CHANGED
data/spec/nesting_spec.rb
CHANGED
data/spec/spec_helper.rb
ADDED
data/spec/table_spec.rb
CHANGED
data/spec/tag_spec.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'markitdown'
|
2
|
+
require 'spec_helper'
|
2
3
|
|
3
4
|
describe Markitdown do
|
4
5
|
context "When parsing a paragraph" do
|
@@ -166,7 +167,7 @@ describe Markitdown do
|
|
166
167
|
|
167
168
|
it "should return valid markdown with spaces" do
|
168
169
|
pending "Still need to figure out leading spaces for <sup> elements"
|
169
|
-
Markitdown.from_html(html).should == "This ^(is a) test"
|
170
|
+
# Markitdown.from_html(html).should == "This ^(is a) test"
|
170
171
|
end
|
171
172
|
end
|
172
173
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: markitdown
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.3.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-08-
|
12
|
+
date: 2013-08-18 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -59,6 +59,22 @@ dependencies:
|
|
59
59
|
- - ! '>='
|
60
60
|
- !ruby/object:Gem::Version
|
61
61
|
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: coveralls
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
62
78
|
description: A library that uses Nokogiri to parse HTML and produce Markdown
|
63
79
|
email:
|
64
80
|
- christopher.petersen@gmail.com
|
@@ -77,12 +93,17 @@ files:
|
|
77
93
|
- markitdown.gemspec
|
78
94
|
- spec/asmartbear.html
|
79
95
|
- spec/asmartbear.markdown
|
96
|
+
- spec/code.html
|
97
|
+
- spec/code_spec.rb
|
98
|
+
- spec/code_with_language.markdown
|
99
|
+
- spec/code_without_language.markdown
|
80
100
|
- spec/doc.html
|
81
101
|
- spec/doc.markdown
|
82
102
|
- spec/doc_spec.rb
|
83
103
|
- spec/evernote.markdown
|
84
104
|
- spec/evernote.xml
|
85
105
|
- spec/nesting_spec.rb
|
106
|
+
- spec/spec_helper.rb
|
86
107
|
- spec/table.html
|
87
108
|
- spec/table.markdown
|
88
109
|
- spec/table2.html
|
@@ -103,7 +124,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
103
124
|
version: '0'
|
104
125
|
segments:
|
105
126
|
- 0
|
106
|
-
hash: -
|
127
|
+
hash: -1182280358588269194
|
107
128
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
108
129
|
none: false
|
109
130
|
requirements:
|
@@ -112,7 +133,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
112
133
|
version: '0'
|
113
134
|
segments:
|
114
135
|
- 0
|
115
|
-
hash: -
|
136
|
+
hash: -1182280358588269194
|
116
137
|
requirements: []
|
117
138
|
rubyforge_project:
|
118
139
|
rubygems_version: 1.8.23
|
@@ -122,12 +143,17 @@ summary: Converts HTML to Markdown
|
|
122
143
|
test_files:
|
123
144
|
- spec/asmartbear.html
|
124
145
|
- spec/asmartbear.markdown
|
146
|
+
- spec/code.html
|
147
|
+
- spec/code_spec.rb
|
148
|
+
- spec/code_with_language.markdown
|
149
|
+
- spec/code_without_language.markdown
|
125
150
|
- spec/doc.html
|
126
151
|
- spec/doc.markdown
|
127
152
|
- spec/doc_spec.rb
|
128
153
|
- spec/evernote.markdown
|
129
154
|
- spec/evernote.xml
|
130
155
|
- spec/nesting_spec.rb
|
156
|
+
- spec/spec_helper.rb
|
131
157
|
- spec/table.html
|
132
158
|
- spec/table.markdown
|
133
159
|
- spec/table2.html
|