markitdown 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/markitdown.rb +5 -3
- data/lib/markitdown/version.rb +1 -1
- data/spec/table.markdown +3 -3
- data/spec/table2.html +68 -41
- data/spec/table2.markdown +13 -6
- metadata +3 -3
data/lib/markitdown.rb
CHANGED
@@ -136,17 +136,19 @@ module Markitdown
|
|
136
136
|
after = "\n\n"
|
137
137
|
when "th"
|
138
138
|
results << "|"
|
139
|
+
strip_content = true
|
139
140
|
flatten_content = true
|
140
141
|
when "td"
|
141
142
|
results << "|"
|
143
|
+
strip_content = true
|
142
144
|
flatten_content = true
|
143
145
|
when "tr"
|
144
146
|
after = "|\n"
|
145
147
|
table = find_parent(node.parent, "table")
|
146
148
|
if table
|
147
|
-
first_row = table.xpath("
|
149
|
+
first_row = table.xpath(".//tr").first
|
148
150
|
if first_row == node
|
149
|
-
cell_count = node.xpath("
|
151
|
+
cell_count = node.xpath(".//th|td").count
|
150
152
|
after << ("|---"*cell_count) + "|\n"
|
151
153
|
end
|
152
154
|
end
|
@@ -156,7 +158,7 @@ module Markitdown
|
|
156
158
|
node.children.each do |child|
|
157
159
|
contents = self.parse_node(child, states)
|
158
160
|
contents = contents.flatten.compact.join.strip if strip_content
|
159
|
-
contents = contents.flatten.compact.join.gsub("\n", " ") if flatten_content
|
161
|
+
contents = [contents].flatten.compact.join.gsub("\n", " ") if flatten_content
|
160
162
|
results << contents
|
161
163
|
end
|
162
164
|
end
|
data/lib/markitdown/version.rb
CHANGED
data/spec/table.markdown
CHANGED
@@ -11,9 +11,9 @@ This [announcement](http://www.google.com) would be followed by instructions.
|
|
11
11
|
|This|is|a|table|
|
12
12
|
|---|---|---|---|
|
13
13
|
|This|is|a|
|
14
|
-
|This|is|a|
|
15
|
-
|This|is|a|
|
16
|
-
|This|is|a
|
14
|
+
|This|is|a|[announcement](http://www.google.com)|
|
15
|
+
|This|is|a|_table_|
|
16
|
+
|This|is|a|**table**|
|
17
17
|
|This|is|a|table|
|
18
18
|
|This|is|a|table|
|
19
19
|
|
data/spec/table2.html
CHANGED
@@ -9,46 +9,73 @@
|
|
9
9
|
<p>This <a href="http://www.google.com">announcement</a> would be followed by instructions.</p>
|
10
10
|
|
11
11
|
<table>
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
12
|
+
<tr>
|
13
|
+
<th>This <br/></th>
|
14
|
+
<th><p>is</p></th>
|
15
|
+
<th>a</th>
|
16
|
+
<th>table</th>
|
17
|
+
</tr>
|
18
|
+
<tr>
|
19
|
+
<td>This</td>
|
20
|
+
<td>is</td>
|
21
|
+
<td>a</td>
|
22
|
+
</tr>
|
23
|
+
<tr>
|
24
|
+
<td>This</td>
|
25
|
+
<td>is</td>
|
26
|
+
<td>a</td>
|
27
|
+
<td><a href="http://www.google.com">announcement</a></td>
|
28
|
+
</tr>
|
29
|
+
<tr>
|
30
|
+
<td>This</td>
|
31
|
+
<td>is</td>
|
32
|
+
<td>a</td>
|
33
|
+
<td><em>table</em></td>
|
34
|
+
</tr>
|
35
|
+
<tr>
|
36
|
+
<td>This</td>
|
37
|
+
<td>is</td>
|
38
|
+
<td>a</td>
|
39
|
+
<td><strong>table</strong></td>
|
40
|
+
</tr>
|
41
|
+
<tr>
|
42
|
+
<td>This</td>
|
43
|
+
<td>is</td>
|
44
|
+
<td>a</td>
|
45
|
+
<td>table <br/></td>
|
46
|
+
</tr>
|
47
|
+
<tr>
|
48
|
+
<td>This</td>
|
49
|
+
<td><p>is</p></td>
|
50
|
+
<td><p>a<br/></p><br/></td>
|
51
|
+
<td>table</td>
|
52
|
+
</tr>
|
53
|
+
</table>
|
54
|
+
|
55
|
+
<p>This paragraph is just to break up the tables</p>
|
56
|
+
|
57
|
+
<table>
|
58
|
+
<tr>
|
59
|
+
<th>This</th>
|
60
|
+
<th>is</th>
|
61
|
+
<th>a</th>
|
62
|
+
<th>second</th>
|
63
|
+
<th>table</th>
|
64
|
+
</tr>
|
65
|
+
<tr>
|
66
|
+
<td>This</td>
|
67
|
+
<td>is</td>
|
68
|
+
<td>a</td>
|
69
|
+
<th>second</th>
|
70
|
+
<th>table</th>
|
71
|
+
</tr>
|
72
|
+
<tr>
|
73
|
+
<td>This</td>
|
74
|
+
<td>is</td>
|
75
|
+
<td>a</td>
|
76
|
+
<td><a href="http://www.google.com">announcement</a></td>
|
77
|
+
<th>second</th>
|
78
|
+
</tr>
|
79
|
+
</table>
|
53
80
|
</body>
|
54
81
|
</html>
|
data/spec/table2.markdown
CHANGED
@@ -8,12 +8,19 @@ This is only a test
|
|
8
8
|
|
9
9
|
This [announcement](http://www.google.com) would be followed by instructions.
|
10
10
|
|
11
|
-
|This
|
11
|
+
|This|is|a|table|
|
12
12
|
|---|---|---|---|
|
13
13
|
|This|is|a|
|
14
|
-
|This|is|a|
|
15
|
-
|This|is|a|
|
16
|
-
|This|is|a
|
17
|
-
|This|is|a|table
|
18
|
-
|This|
|
14
|
+
|This|is|a|[announcement](http://www.google.com)|
|
15
|
+
|This|is|a|_table_|
|
16
|
+
|This|is|a|**table**|
|
17
|
+
|This|is|a|table|
|
18
|
+
|This|is|a|table|
|
19
|
+
|
20
|
+
This paragraph is just to break up the tables
|
21
|
+
|
22
|
+
|This|is|a|second|table|
|
23
|
+
|---|---|---|---|---|
|
24
|
+
|This|is|a|second|table|
|
25
|
+
|This|is|a|[announcement](http://www.google.com)|second|
|
19
26
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: markitdown
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -103,7 +103,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
103
103
|
version: '0'
|
104
104
|
segments:
|
105
105
|
- 0
|
106
|
-
hash:
|
106
|
+
hash: 3593533597031584594
|
107
107
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
108
108
|
none: false
|
109
109
|
requirements:
|
@@ -112,7 +112,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
112
112
|
version: '0'
|
113
113
|
segments:
|
114
114
|
- 0
|
115
|
-
hash:
|
115
|
+
hash: 3593533597031584594
|
116
116
|
requirements: []
|
117
117
|
rubyforge_project:
|
118
118
|
rubygems_version: 1.8.23
|