markitdown 0.0.10 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +1 -0
- data/lib/markitdown.rb +25 -0
- data/lib/markitdown/version.rb +1 -1
- data/spec/table.html +59 -0
- data/spec/table.markdown +19 -0
- data/spec/table2.html +54 -0
- data/spec/table2.markdown +19 -0
- data/spec/table_spec.rb +19 -0
- metadata +14 -4
data/README.md
CHANGED
data/lib/markitdown.rb
CHANGED
@@ -130,7 +130,25 @@ module Markitdown
|
|
130
130
|
results << " `#{node.text}` "
|
131
131
|
end
|
132
132
|
recurse = false
|
133
|
+
when "table"
|
134
|
+
results << "\n\n"
|
135
|
+
after = "\n\n"
|
136
|
+
when "th"
|
137
|
+
results << "|"
|
138
|
+
when "td"
|
139
|
+
results << "|"
|
140
|
+
when "tr"
|
141
|
+
after = "|\n"
|
142
|
+
table = find_parent(node.parent, "table")
|
143
|
+
if table
|
144
|
+
first_row = table.xpath("//tr").first
|
145
|
+
if first_row == node
|
146
|
+
cell_count = node.xpath("//th|td").count
|
147
|
+
after << ("|---"*cell_count) + "|\n"
|
148
|
+
end
|
149
|
+
end
|
133
150
|
end
|
151
|
+
|
134
152
|
if recurse
|
135
153
|
node.children.each do |child|
|
136
154
|
contents = self.parse_node(child, states)
|
@@ -138,6 +156,7 @@ module Markitdown
|
|
138
156
|
results << contents
|
139
157
|
end
|
140
158
|
end
|
159
|
+
|
141
160
|
if strip_content
|
142
161
|
last_tags = results.pop
|
143
162
|
after = after.flatten.compact.join if after.is_a?(Array)
|
@@ -195,4 +214,10 @@ module Markitdown
|
|
195
214
|
end
|
196
215
|
result
|
197
216
|
end
|
217
|
+
|
218
|
+
def self.find_parent(node, tag_name)
|
219
|
+
return nil unless node
|
220
|
+
return node if node.name == tag_name
|
221
|
+
find_parent(node.parent, tag_name)
|
222
|
+
end
|
198
223
|
end
|
data/lib/markitdown/version.rb
CHANGED
data/spec/table.html
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
<html>
|
2
|
+
<body>
|
3
|
+
<h1>This is a test</h1>
|
4
|
+
|
5
|
+
<p>This is only a test</p>
|
6
|
+
|
7
|
+
<h2>Had this been a real test</h2>
|
8
|
+
|
9
|
+
<p>This <a href="http://www.google.com">announcement</a> would be followed by instructions.</p>
|
10
|
+
|
11
|
+
<table>
|
12
|
+
<thead>
|
13
|
+
<tr>
|
14
|
+
<th>This</th>
|
15
|
+
<th>is</th>
|
16
|
+
<th>a</th>
|
17
|
+
<th>table</th>
|
18
|
+
</tr>
|
19
|
+
</thead>
|
20
|
+
<tbody>
|
21
|
+
<tr>
|
22
|
+
<td>This</td>
|
23
|
+
<td>is</td>
|
24
|
+
<td>a</td>
|
25
|
+
</tr>
|
26
|
+
<tr>
|
27
|
+
<td>This</td>
|
28
|
+
<td>is</td>
|
29
|
+
<td>a</td>
|
30
|
+
<td><a href="http://www.google.com">announcement</a></td>
|
31
|
+
</tr>
|
32
|
+
<tr>
|
33
|
+
<td>This</td>
|
34
|
+
<td>is</td>
|
35
|
+
<td>a</td>
|
36
|
+
<td><em>table</em></td>
|
37
|
+
</tr>
|
38
|
+
<tr>
|
39
|
+
<td>This</td>
|
40
|
+
<td>is</td>
|
41
|
+
<td>a</td>
|
42
|
+
<td><strong>table</strong></td>
|
43
|
+
</tr>
|
44
|
+
<tr>
|
45
|
+
<td>This</td>
|
46
|
+
<td>is</td>
|
47
|
+
<td>a</td>
|
48
|
+
<td>table</td>
|
49
|
+
</tr>
|
50
|
+
<tr>
|
51
|
+
<td>This</td>
|
52
|
+
<td>is</td>
|
53
|
+
<td>a</td>
|
54
|
+
<td>table</td>
|
55
|
+
</tr>
|
56
|
+
</tbody>
|
57
|
+
</table>
|
58
|
+
</body>
|
59
|
+
</html>
|
data/spec/table.markdown
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
# This is a test
|
4
|
+
|
5
|
+
This is only a test
|
6
|
+
|
7
|
+
## Had this been a real test
|
8
|
+
|
9
|
+
This [announcement](http://www.google.com) would be followed by instructions.
|
10
|
+
|
11
|
+
|This|is|a|table|
|
12
|
+
|---|---|---|---|
|
13
|
+
|This|is|a|
|
14
|
+
|This|is|a| [announcement](http://www.google.com) |
|
15
|
+
|This|is|a| _table_ |
|
16
|
+
|This|is|a| **table** |
|
17
|
+
|This|is|a|table|
|
18
|
+
|This|is|a|table|
|
19
|
+
|
data/spec/table2.html
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
<html>
|
2
|
+
<body>
|
3
|
+
<h1>This is a test</h1>
|
4
|
+
|
5
|
+
<p>This is only a test</p>
|
6
|
+
|
7
|
+
<h2>Had this been a real test</h2>
|
8
|
+
|
9
|
+
<p>This <a href="http://www.google.com">announcement</a> would be followed by instructions.</p>
|
10
|
+
|
11
|
+
<table>
|
12
|
+
<tr>
|
13
|
+
<th>This</th>
|
14
|
+
<th>is</th>
|
15
|
+
<th>a</th>
|
16
|
+
<th>table</th>
|
17
|
+
</tr>
|
18
|
+
<tr>
|
19
|
+
<td>This</td>
|
20
|
+
<td>is</td>
|
21
|
+
<td>a</td>
|
22
|
+
</tr>
|
23
|
+
<tr>
|
24
|
+
<td>This</td>
|
25
|
+
<td>is</td>
|
26
|
+
<td>a</td>
|
27
|
+
<td><a href="http://www.google.com">announcement</a></td>
|
28
|
+
</tr>
|
29
|
+
<tr>
|
30
|
+
<td>This</td>
|
31
|
+
<td>is</td>
|
32
|
+
<td>a</td>
|
33
|
+
<td><em>table</em></td>
|
34
|
+
</tr>
|
35
|
+
<tr>
|
36
|
+
<td>This</td>
|
37
|
+
<td>is</td>
|
38
|
+
<td>a</td>
|
39
|
+
<td><strong>table</strong></td>
|
40
|
+
</tr>
|
41
|
+
<tr>
|
42
|
+
<td>This</td>
|
43
|
+
<td>is</td>
|
44
|
+
<td>a</td>
|
45
|
+
<td>table</td>
|
46
|
+
</tr>
|
47
|
+
<tr>
|
48
|
+
<td>This</td>
|
49
|
+
<td>is</td>
|
50
|
+
<td>a</td>
|
51
|
+
<td>table</td>
|
52
|
+
</tr>
|
53
|
+
</body>
|
54
|
+
</html>
|
data/spec/table2.markdown
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
# This is a test
|
4
|
+
|
5
|
+
This is only a test
|
6
|
+
|
7
|
+
## Had this been a real test
|
8
|
+
|
9
|
+
This [announcement](http://www.google.com) would be followed by instructions.
|
10
|
+
|
11
|
+
|This|is|a|table|
|
12
|
+
|---|---|---|---|
|
13
|
+
|This|is|a|
|
14
|
+
|This|is|a| [announcement](http://www.google.com) |
|
15
|
+
|This|is|a| _table_ |
|
16
|
+
|This|is|a| **table** |
|
17
|
+
|This|is|a|table|
|
18
|
+
|This|is|a|table|
|
19
|
+
|
data/spec/table_spec.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'markitdown'
|
2
|
+
|
3
|
+
describe Markitdown do
|
4
|
+
context "When parsing a table with a thead and tbody" do
|
5
|
+
let(:html) { File.read("spec/table.html") }
|
6
|
+
|
7
|
+
it "should produce valid markdown" do
|
8
|
+
Markitdown.from_html(html).should == File.read("spec/table.markdown")
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
context "When parsing a table without a thead and tbody" do
|
13
|
+
let(:html) { File.read("spec/table2.html") }
|
14
|
+
|
15
|
+
it "should produce valid markdown" do
|
16
|
+
Markitdown.from_html(html).should == File.read("spec/table2.markdown")
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: markitdown
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.10
|
4
|
+
version: 0.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-07-
|
12
|
+
date: 2013-07-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -83,6 +83,11 @@ files:
|
|
83
83
|
- spec/evernote.markdown
|
84
84
|
- spec/evernote.xml
|
85
85
|
- spec/nesting_spec.rb
|
86
|
+
- spec/table.html
|
87
|
+
- spec/table.markdown
|
88
|
+
- spec/table2.html
|
89
|
+
- spec/table2.markdown
|
90
|
+
- spec/table_spec.rb
|
86
91
|
- spec/tag_spec.rb
|
87
92
|
homepage: https://github.com/cpetersen/markitdown
|
88
93
|
licenses: []
|
@@ -98,7 +103,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
98
103
|
version: '0'
|
99
104
|
segments:
|
100
105
|
- 0
|
101
|
-
hash:
|
106
|
+
hash: 3407872921255543985
|
102
107
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
103
108
|
none: false
|
104
109
|
requirements:
|
@@ -107,7 +112,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
107
112
|
version: '0'
|
108
113
|
segments:
|
109
114
|
- 0
|
110
|
-
hash:
|
115
|
+
hash: 3407872921255543985
|
111
116
|
requirements: []
|
112
117
|
rubyforge_project:
|
113
118
|
rubygems_version: 1.8.23
|
@@ -123,4 +128,9 @@ test_files:
|
|
123
128
|
- spec/evernote.markdown
|
124
129
|
- spec/evernote.xml
|
125
130
|
- spec/nesting_spec.rb
|
131
|
+
- spec/table.html
|
132
|
+
- spec/table.markdown
|
133
|
+
- spec/table2.html
|
134
|
+
- spec/table2.markdown
|
135
|
+
- spec/table_spec.rb
|
126
136
|
- spec/tag_spec.rb
|