markitdown 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +1 -1
- data/lib/markitdown.rb +13 -7
- data/lib/markitdown/version.rb +1 -1
- data/markitdown.gemspec +1 -0
- data/spec/code.html +38 -0
- data/spec/code_spec.rb +25 -0
- data/spec/code_with_language.markdown +40 -0
- data/spec/code_without_language.markdown +40 -0
- data/spec/doc_spec.rb +1 -0
- data/spec/nesting_spec.rb +1 -0
- data/spec/spec_helper.rb +15 -0
- data/spec/table_spec.rb +1 -0
- data/spec/tag_spec.rb +2 -1
- metadata +30 -4
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Markitdown [](http://travis-ci.org/cpetersen/markitdown)
|
1
|
+
# Markitdown [](http://travis-ci.org/cpetersen/markitdown) [](https://coveralls.io/r/cpetersen/markitdown?branch=master)
|
2
2
|
|
3
3
|
Markitdown is a Ruby library that converts HTML to Markdown. It's powered by Nokogiri. It supports:
|
4
4
|
|
data/lib/markitdown.rb
CHANGED
@@ -4,20 +4,20 @@ require "markitdown/version"
|
|
4
4
|
require "nokogiri"
|
5
5
|
|
6
6
|
module Markitdown
|
7
|
-
def self.from_html(html)
|
8
|
-
from_nokogiri(Nokogiri::XML(html).root)
|
7
|
+
def self.from_html(html, language_classifier=nil)
|
8
|
+
from_nokogiri(Nokogiri::XML(html).root, language_classifier)
|
9
9
|
end
|
10
10
|
|
11
|
-
def self.from_nokogiri(node)
|
11
|
+
def self.from_nokogiri(node, language_classifier=nil)
|
12
12
|
# gsub(/\n\s+\n/,"\n\n") - remove lines with nothing but space characters
|
13
13
|
# gsub(/\n{2,}/,"\n\n") - collapse any series of more than 2 new lines down to 2
|
14
14
|
# gsub(/\t+/," ") - collapse consecutive tabs down to a single space. I use tabs to pad divs and span, this causes multiple nested spans and divs to ultimately be surrounded by a single space.
|
15
15
|
# gsub(/ ([\.\?])/,'\1') - removes a space before a period or question mark. Things like links get surrounded by spaces. If they appear at the end of a sentence, this makes sure the punctuation isn't off.
|
16
|
-
self.parse_node(node).flatten.compact.join.gsub(/\n\s+\n/,"\n\n").gsub(/\n{2,}/,"\n\n").gsub(/( > \n){2,}/,"\n > \n > ").gsub(/\t+/," ").gsub(/ ([\.\?])/,'\1').gsub(/\s*END_TAG\((.{1,3})\)/, "\\1")
|
16
|
+
self.parse_node(node, [], language_classifier).flatten.compact.join.gsub(/\n\s+\n/,"\n\n").gsub(/\n{2,}/,"\n\n").gsub(/( > \n){2,}/,"\n > \n > ").gsub(/\t+/," ").gsub(/ ([\.\?])/,'\1').gsub(/\s*END_TAG\((.{1,3})\)/, "\\1")
|
17
17
|
end
|
18
18
|
|
19
19
|
private
|
20
|
-
def self.parse_node(node, states=[])
|
20
|
+
def self.parse_node(node, states=[], language_classifier=nil)
|
21
21
|
results=[]
|
22
22
|
after = nil
|
23
23
|
states.unshift node.name.downcase
|
@@ -141,7 +141,13 @@ module Markitdown
|
|
141
141
|
results << node.text.strip.gsub("\n","").gsub(/ {2,}/," ")
|
142
142
|
when "code"
|
143
143
|
if node.text.include?("\n")
|
144
|
-
|
144
|
+
text = node.text.gsub(/^\n/,"").gsub(/\n\s*$/,"")
|
145
|
+
if language_classifier
|
146
|
+
language = language_classifier.classify(text)
|
147
|
+
results << "\n\n```#{language}\n#{text}\n```\n\n"
|
148
|
+
else
|
149
|
+
results << "\n\n```\n#{text}\n```\n\n"
|
150
|
+
end
|
145
151
|
else
|
146
152
|
results << " `#{node.text}` "
|
147
153
|
end
|
@@ -171,7 +177,7 @@ module Markitdown
|
|
171
177
|
|
172
178
|
if recurse
|
173
179
|
node.children.each do |child|
|
174
|
-
contents = self.parse_node(child, states)
|
180
|
+
contents = self.parse_node(child, states, language_classifier)
|
175
181
|
contents = contents.flatten.compact.join.strip if strip_content
|
176
182
|
contents = [contents].flatten.compact.join.gsub("\n", " ") if flatten_content
|
177
183
|
results << contents
|
data/lib/markitdown/version.rb
CHANGED
data/markitdown.gemspec
CHANGED
@@ -11,6 +11,7 @@ Gem::Specification.new do |gem|
|
|
11
11
|
gem.add_dependency('nokogiri')
|
12
12
|
gem.add_development_dependency('rake')
|
13
13
|
gem.add_development_dependency('rspec')
|
14
|
+
gem.add_development_dependency('coveralls')
|
14
15
|
|
15
16
|
gem.files = `git ls-files`.split($\)
|
16
17
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
data/spec/code.html
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
<div>
|
2
|
+
This is an html block
|
3
|
+
<code>
|
4
|
+
<table>
|
5
|
+
<thead>
|
6
|
+
<tr>
|
7
|
+
<th>Column 1</th>
|
8
|
+
<th>Column 2</th>
|
9
|
+
<th>Column 3</th>
|
10
|
+
</tr>
|
11
|
+
</thead>
|
12
|
+
<tbody>
|
13
|
+
<tr>
|
14
|
+
<td>Value 1</td>
|
15
|
+
<td>Value 2</td>
|
16
|
+
<td>Value 3</td>
|
17
|
+
</tr>
|
18
|
+
<tr>
|
19
|
+
<td>Value 1a</td>
|
20
|
+
<td>Value 2a</td>
|
21
|
+
<td>Value 3a</td>
|
22
|
+
</tr>
|
23
|
+
</tbody>
|
24
|
+
</table>
|
25
|
+
</code>
|
26
|
+
|
27
|
+
This is a ruby block
|
28
|
+
<code>
|
29
|
+
# GET /blogs/1
|
30
|
+
# GET /blogs/1.json
|
31
|
+
def show
|
32
|
+
respond_to do |format|
|
33
|
+
format.html # show.html.erb
|
34
|
+
format.json { render json: @blog }
|
35
|
+
end
|
36
|
+
end
|
37
|
+
</code>
|
38
|
+
</div>
|
data/spec/code_spec.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'markitdown'
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
describe Markitdown do
|
5
|
+
context "When parsing codeblocks" do
|
6
|
+
let(:html) { File.read("spec/code.html") }
|
7
|
+
|
8
|
+
context "and not guessing the language" do
|
9
|
+
let(:markdown) { File.read("spec/code_without_language.markdown") }
|
10
|
+
|
11
|
+
it "should produce valid markdown" do
|
12
|
+
Markitdown.from_html(html).should == markdown
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
context "and guessing the language" do
|
17
|
+
let(:markdown) { File.read("spec/code_with_language.markdown") }
|
18
|
+
let(:classifier) { TestLanguageClassifier.new }
|
19
|
+
|
20
|
+
it "should produce valid markdown" do
|
21
|
+
Markitdown.from_html(html, classifier).should == markdown
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
This is an html block
|
2
|
+
|
3
|
+
```html
|
4
|
+
<table>
|
5
|
+
<thead>
|
6
|
+
<tr>
|
7
|
+
<th>Column 1</th>
|
8
|
+
<th>Column 2</th>
|
9
|
+
<th>Column 3</th>
|
10
|
+
</tr>
|
11
|
+
</thead>
|
12
|
+
<tbody>
|
13
|
+
<tr>
|
14
|
+
<td>Value 1</td>
|
15
|
+
<td>Value 2</td>
|
16
|
+
<td>Value 3</td>
|
17
|
+
</tr>
|
18
|
+
<tr>
|
19
|
+
<td>Value 1a</td>
|
20
|
+
<td>Value 2a</td>
|
21
|
+
<td>Value 3a</td>
|
22
|
+
</tr>
|
23
|
+
</tbody>
|
24
|
+
</table>
|
25
|
+
```
|
26
|
+
|
27
|
+
This is a ruby block
|
28
|
+
|
29
|
+
```ruby
|
30
|
+
# GET /blogs/1
|
31
|
+
# GET /blogs/1.json
|
32
|
+
def show
|
33
|
+
respond_to do |format|
|
34
|
+
format.html # show.html.erb
|
35
|
+
format.json { render json: @blog }
|
36
|
+
end
|
37
|
+
end
|
38
|
+
```
|
39
|
+
|
40
|
+
|
@@ -0,0 +1,40 @@
|
|
1
|
+
This is an html block
|
2
|
+
|
3
|
+
```
|
4
|
+
<table>
|
5
|
+
<thead>
|
6
|
+
<tr>
|
7
|
+
<th>Column 1</th>
|
8
|
+
<th>Column 2</th>
|
9
|
+
<th>Column 3</th>
|
10
|
+
</tr>
|
11
|
+
</thead>
|
12
|
+
<tbody>
|
13
|
+
<tr>
|
14
|
+
<td>Value 1</td>
|
15
|
+
<td>Value 2</td>
|
16
|
+
<td>Value 3</td>
|
17
|
+
</tr>
|
18
|
+
<tr>
|
19
|
+
<td>Value 1a</td>
|
20
|
+
<td>Value 2a</td>
|
21
|
+
<td>Value 3a</td>
|
22
|
+
</tr>
|
23
|
+
</tbody>
|
24
|
+
</table>
|
25
|
+
```
|
26
|
+
|
27
|
+
This is a ruby block
|
28
|
+
|
29
|
+
```
|
30
|
+
# GET /blogs/1
|
31
|
+
# GET /blogs/1.json
|
32
|
+
def show
|
33
|
+
respond_to do |format|
|
34
|
+
format.html # show.html.erb
|
35
|
+
format.json { render json: @blog }
|
36
|
+
end
|
37
|
+
end
|
38
|
+
```
|
39
|
+
|
40
|
+
|
data/spec/doc_spec.rb
CHANGED
data/spec/nesting_spec.rb
CHANGED
data/spec/spec_helper.rb
ADDED
data/spec/table_spec.rb
CHANGED
data/spec/tag_spec.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'markitdown'
|
2
|
+
require 'spec_helper'
|
2
3
|
|
3
4
|
describe Markitdown do
|
4
5
|
context "When parsing a paragraph" do
|
@@ -166,7 +167,7 @@ describe Markitdown do
|
|
166
167
|
|
167
168
|
it "should return valid markdown with spaces" do
|
168
169
|
pending "Still need to figure out leading spaces for <sup> elements"
|
169
|
-
Markitdown.from_html(html).should == "This ^(is a) test"
|
170
|
+
# Markitdown.from_html(html).should == "This ^(is a) test"
|
170
171
|
end
|
171
172
|
end
|
172
173
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: markitdown
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.3.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-08-
|
12
|
+
date: 2013-08-18 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -59,6 +59,22 @@ dependencies:
|
|
59
59
|
- - ! '>='
|
60
60
|
- !ruby/object:Gem::Version
|
61
61
|
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: coveralls
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
62
78
|
description: A library that uses Nokogiri to parse HTML and produce Markdown
|
63
79
|
email:
|
64
80
|
- christopher.petersen@gmail.com
|
@@ -77,12 +93,17 @@ files:
|
|
77
93
|
- markitdown.gemspec
|
78
94
|
- spec/asmartbear.html
|
79
95
|
- spec/asmartbear.markdown
|
96
|
+
- spec/code.html
|
97
|
+
- spec/code_spec.rb
|
98
|
+
- spec/code_with_language.markdown
|
99
|
+
- spec/code_without_language.markdown
|
80
100
|
- spec/doc.html
|
81
101
|
- spec/doc.markdown
|
82
102
|
- spec/doc_spec.rb
|
83
103
|
- spec/evernote.markdown
|
84
104
|
- spec/evernote.xml
|
85
105
|
- spec/nesting_spec.rb
|
106
|
+
- spec/spec_helper.rb
|
86
107
|
- spec/table.html
|
87
108
|
- spec/table.markdown
|
88
109
|
- spec/table2.html
|
@@ -103,7 +124,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
103
124
|
version: '0'
|
104
125
|
segments:
|
105
126
|
- 0
|
106
|
-
hash: -
|
127
|
+
hash: -1182280358588269194
|
107
128
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
108
129
|
none: false
|
109
130
|
requirements:
|
@@ -112,7 +133,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
112
133
|
version: '0'
|
113
134
|
segments:
|
114
135
|
- 0
|
115
|
-
hash: -
|
136
|
+
hash: -1182280358588269194
|
116
137
|
requirements: []
|
117
138
|
rubyforge_project:
|
118
139
|
rubygems_version: 1.8.23
|
@@ -122,12 +143,17 @@ summary: Converts HTML to Markdown
|
|
122
143
|
test_files:
|
123
144
|
- spec/asmartbear.html
|
124
145
|
- spec/asmartbear.markdown
|
146
|
+
- spec/code.html
|
147
|
+
- spec/code_spec.rb
|
148
|
+
- spec/code_with_language.markdown
|
149
|
+
- spec/code_without_language.markdown
|
125
150
|
- spec/doc.html
|
126
151
|
- spec/doc.markdown
|
127
152
|
- spec/doc_spec.rb
|
128
153
|
- spec/evernote.markdown
|
129
154
|
- spec/evernote.xml
|
130
155
|
- spec/nesting_spec.rb
|
156
|
+
- spec/spec_helper.rb
|
131
157
|
- spec/table.html
|
132
158
|
- spec/table.markdown
|
133
159
|
- spec/table2.html
|