markitdown 0.0.10 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +1 -0
- data/lib/markitdown.rb +25 -0
- data/lib/markitdown/version.rb +1 -1
- data/spec/table.html +59 -0
- data/spec/table.markdown +19 -0
- data/spec/table2.html +54 -0
- data/spec/table2.markdown +19 -0
- data/spec/table_spec.rb +19 -0
- metadata +14 -4
data/README.md
CHANGED
data/lib/markitdown.rb
CHANGED
@@ -130,7 +130,25 @@ module Markitdown
|
|
130
130
|
results << " `#{node.text}` "
|
131
131
|
end
|
132
132
|
recurse = false
|
133
|
+
when "table"
|
134
|
+
results << "\n\n"
|
135
|
+
after = "\n\n"
|
136
|
+
when "th"
|
137
|
+
results << "|"
|
138
|
+
when "td"
|
139
|
+
results << "|"
|
140
|
+
when "tr"
|
141
|
+
after = "|\n"
|
142
|
+
table = find_parent(node.parent, "table")
|
143
|
+
if table
|
144
|
+
first_row = table.xpath("//tr").first
|
145
|
+
if first_row == node
|
146
|
+
cell_count = node.xpath("//th|td").count
|
147
|
+
after << ("|---"*cell_count) + "|\n"
|
148
|
+
end
|
149
|
+
end
|
133
150
|
end
|
151
|
+
|
134
152
|
if recurse
|
135
153
|
node.children.each do |child|
|
136
154
|
contents = self.parse_node(child, states)
|
@@ -138,6 +156,7 @@ module Markitdown
|
|
138
156
|
results << contents
|
139
157
|
end
|
140
158
|
end
|
159
|
+
|
141
160
|
if strip_content
|
142
161
|
last_tags = results.pop
|
143
162
|
after = after.flatten.compact.join if after.is_a?(Array)
|
@@ -195,4 +214,10 @@ module Markitdown
|
|
195
214
|
end
|
196
215
|
result
|
197
216
|
end
|
217
|
+
|
218
|
+
# Returns the nearest node named +tag_name+, starting the search at +node+
# itself and then moving up through each +parent+ in turn.
#
# @param node [#name, #parent, nil] node to start from; nil yields nil
# @param tag_name [String, Symbol] element name to look for, e.g. "table"
# @return [Object, nil] the first matching node, or nil when the chain of
#   parents ends without a match
def self.find_parent(node, tag_name)
  # Normalise once so callers may pass either "table" or :table.
  wanted = tag_name.to_s
  current = node
  # Loop instead of recursing so the walk does not add a stack frame per level.
  while current
    return current if current.name == wanted
    current = current.parent
  end
  nil
end
|
198
223
|
end
|
data/lib/markitdown/version.rb
CHANGED
data/spec/table.html
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
<html>
|
2
|
+
<body>
|
3
|
+
<h1>This is a test</h1>
|
4
|
+
|
5
|
+
<p>This is only a test</p>
|
6
|
+
|
7
|
+
<h2>Had this been a real test</h2>
|
8
|
+
|
9
|
+
<p>This <a href="http://www.google.com">announcement</a> would be followed by instructions.</p>
|
10
|
+
|
11
|
+
<table>
|
12
|
+
<thead>
|
13
|
+
<tr>
|
14
|
+
<th>This</th>
|
15
|
+
<th>is</th>
|
16
|
+
<th>a</th>
|
17
|
+
<th>table</th>
|
18
|
+
</tr>
|
19
|
+
</thead>
|
20
|
+
<tbody>
|
21
|
+
<tr>
|
22
|
+
<td>This</td>
|
23
|
+
<td>is</td>
|
24
|
+
<td>a</td>
|
25
|
+
</tr>
|
26
|
+
<tr>
|
27
|
+
<td>This</td>
|
28
|
+
<td>is</td>
|
29
|
+
<td>a</td>
|
30
|
+
<td><a href="http://www.google.com">announcement</a></td>
|
31
|
+
</tr>
|
32
|
+
<tr>
|
33
|
+
<td>This</td>
|
34
|
+
<td>is</td>
|
35
|
+
<td>a</td>
|
36
|
+
<td><em>table</em></td>
|
37
|
+
</tr>
|
38
|
+
<tr>
|
39
|
+
<td>This</td>
|
40
|
+
<td>is</td>
|
41
|
+
<td>a</td>
|
42
|
+
<td><strong>table</strong></td>
|
43
|
+
</tr>
|
44
|
+
<tr>
|
45
|
+
<td>This</td>
|
46
|
+
<td>is</td>
|
47
|
+
<td>a</td>
|
48
|
+
<td>table</td>
|
49
|
+
</tr>
|
50
|
+
<tr>
|
51
|
+
<td>This</td>
|
52
|
+
<td>is</td>
|
53
|
+
<td>a</td>
|
54
|
+
<td>table</td>
|
55
|
+
</tr>
|
56
|
+
</tbody>
|
57
|
+
</table>
|
58
|
+
</body>
|
59
|
+
</html>
|
data/spec/table.markdown
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
# This is a test
|
4
|
+
|
5
|
+
This is only a test
|
6
|
+
|
7
|
+
## Had this been a real test
|
8
|
+
|
9
|
+
This [announcement](http://www.google.com) would be followed by instructions.
|
10
|
+
|
11
|
+
|This|is|a|table|
|
12
|
+
|---|---|---|---|
|
13
|
+
|This|is|a|
|
14
|
+
|This|is|a| [announcement](http://www.google.com) |
|
15
|
+
|This|is|a| _table_ |
|
16
|
+
|This|is|a| **table** |
|
17
|
+
|This|is|a|table|
|
18
|
+
|This|is|a|table|
|
19
|
+
|
data/spec/table2.html
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
<html>
|
2
|
+
<body>
|
3
|
+
<h1>This is a test</h1>
|
4
|
+
|
5
|
+
<p>This is only a test</p>
|
6
|
+
|
7
|
+
<h2>Had this been a real test</h2>
|
8
|
+
|
9
|
+
<p>This <a href="http://www.google.com">announcement</a> would be followed by instructions.</p>
|
10
|
+
|
11
|
+
<table>
|
12
|
+
<tr>
|
13
|
+
<th>This</th>
|
14
|
+
<th>is</th>
|
15
|
+
<th>a</th>
|
16
|
+
<th>table</th>
|
17
|
+
</tr>
|
18
|
+
<tr>
|
19
|
+
<td>This</td>
|
20
|
+
<td>is</td>
|
21
|
+
<td>a</td>
|
22
|
+
</tr>
|
23
|
+
<tr>
|
24
|
+
<td>This</td>
|
25
|
+
<td>is</td>
|
26
|
+
<td>a</td>
|
27
|
+
<td><a href="http://www.google.com">announcement</a></td>
|
28
|
+
</tr>
|
29
|
+
<tr>
|
30
|
+
<td>This</td>
|
31
|
+
<td>is</td>
|
32
|
+
<td>a</td>
|
33
|
+
<td><em>table</em></td>
|
34
|
+
</tr>
|
35
|
+
<tr>
|
36
|
+
<td>This</td>
|
37
|
+
<td>is</td>
|
38
|
+
<td>a</td>
|
39
|
+
<td><strong>table</strong></td>
|
40
|
+
</tr>
|
41
|
+
<tr>
|
42
|
+
<td>This</td>
|
43
|
+
<td>is</td>
|
44
|
+
<td>a</td>
|
45
|
+
<td>table</td>
|
46
|
+
</tr>
|
47
|
+
<tr>
|
48
|
+
<td>This</td>
|
49
|
+
<td>is</td>
|
50
|
+
<td>a</td>
|
51
|
+
<td>table</td>
|
52
|
+
</tr>
|
53
|
+
</body>
|
54
|
+
</html>
|
data/spec/table2.markdown
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
# This is a test
|
4
|
+
|
5
|
+
This is only a test
|
6
|
+
|
7
|
+
## Had this been a real test
|
8
|
+
|
9
|
+
This [announcement](http://www.google.com) would be followed by instructions.
|
10
|
+
|
11
|
+
|This|is|a|table|
|
12
|
+
|---|---|---|---|
|
13
|
+
|This|is|a|
|
14
|
+
|This|is|a| [announcement](http://www.google.com) |
|
15
|
+
|This|is|a| _table_ |
|
16
|
+
|This|is|a| **table** |
|
17
|
+
|This|is|a|table|
|
18
|
+
|This|is|a|table|
|
19
|
+
|
data/spec/table_spec.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'markitdown'
|
2
|
+
|
3
|
+
# Table conversion specs: each HTML fixture is converted and the result is
# compared with the Markdown fixture stored beside it.
describe Markitdown do
  [
    ["When parsing a table with a thead and tbody", "spec/table.html", "spec/table.markdown"],
    ["When parsing a table without a thead and tbody", "spec/table2.html", "spec/table2.markdown"]
  ].each do |scenario, html_path, markdown_path|
    context scenario do
      let(:html) { File.read(html_path) }

      it "should produce valid markdown" do
        Markitdown.from_html(html).should == File.read(markdown_path)
      end
    end
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: markitdown
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.10
|
4
|
+
version: 0.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-07-
|
12
|
+
date: 2013-07-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -83,6 +83,11 @@ files:
|
|
83
83
|
- spec/evernote.markdown
|
84
84
|
- spec/evernote.xml
|
85
85
|
- spec/nesting_spec.rb
|
86
|
+
- spec/table.html
|
87
|
+
- spec/table.markdown
|
88
|
+
- spec/table2.html
|
89
|
+
- spec/table2.markdown
|
90
|
+
- spec/table_spec.rb
|
86
91
|
- spec/tag_spec.rb
|
87
92
|
homepage: https://github.com/cpetersen/markitdown
|
88
93
|
licenses: []
|
@@ -98,7 +103,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
98
103
|
version: '0'
|
99
104
|
segments:
|
100
105
|
- 0
|
101
|
-
hash:
|
106
|
+
hash: 3407872921255543985
|
102
107
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
103
108
|
none: false
|
104
109
|
requirements:
|
@@ -107,7 +112,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
107
112
|
version: '0'
|
108
113
|
segments:
|
109
114
|
- 0
|
110
|
-
hash:
|
115
|
+
hash: 3407872921255543985
|
111
116
|
requirements: []
|
112
117
|
rubyforge_project:
|
113
118
|
rubygems_version: 1.8.23
|
@@ -123,4 +128,9 @@ test_files:
|
|
123
128
|
- spec/evernote.markdown
|
124
129
|
- spec/evernote.xml
|
125
130
|
- spec/nesting_spec.rb
|
131
|
+
- spec/table.html
|
132
|
+
- spec/table.markdown
|
133
|
+
- spec/table2.html
|
134
|
+
- spec/table2.markdown
|
135
|
+
- spec/table_spec.rb
|
126
136
|
- spec/tag_spec.rb
|