markitdown 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/markitdown.rb +5 -3
- data/lib/markitdown/version.rb +1 -1
- data/spec/table.markdown +3 -3
- data/spec/table2.html +68 -41
- data/spec/table2.markdown +13 -6
- metadata +3 -3
data/lib/markitdown.rb
CHANGED
@@ -136,17 +136,19 @@ module Markitdown
|
|
136
136
|
after = "\n\n"
|
137
137
|
when "th"
|
138
138
|
results << "|"
|
139
|
+
strip_content = true
|
139
140
|
flatten_content = true
|
140
141
|
when "td"
|
141
142
|
results << "|"
|
143
|
+
strip_content = true
|
142
144
|
flatten_content = true
|
143
145
|
when "tr"
|
144
146
|
after = "|\n"
|
145
147
|
table = find_parent(node.parent, "table")
|
146
148
|
if table
|
147
|
-
first_row = table.xpath("
|
149
|
+
first_row = table.xpath(".//tr").first
|
148
150
|
if first_row == node
|
149
|
-
cell_count = node.xpath("
|
151
|
+
cell_count = node.xpath(".//th|td").count
|
150
152
|
after << ("|---"*cell_count) + "|\n"
|
151
153
|
end
|
152
154
|
end
|
@@ -156,7 +158,7 @@ module Markitdown
|
|
156
158
|
node.children.each do |child|
|
157
159
|
contents = self.parse_node(child, states)
|
158
160
|
contents = contents.flatten.compact.join.strip if strip_content
|
159
|
-
contents = contents.flatten.compact.join.gsub("\n", " ") if flatten_content
|
161
|
+
contents = [contents].flatten.compact.join.gsub("\n", " ") if flatten_content
|
160
162
|
results << contents
|
161
163
|
end
|
162
164
|
end
|
data/lib/markitdown/version.rb
CHANGED
data/spec/table.markdown
CHANGED
@@ -11,9 +11,9 @@ This [announcement](http://www.google.com) would be followed by instructions.
|
|
11
11
|
|This|is|a|table|
|
12
12
|
|---|---|---|---|
|
13
13
|
|This|is|a|
|
14
|
-
|This|is|a|
|
15
|
-
|This|is|a|
|
16
|
-
|This|is|a
|
14
|
+
|This|is|a|[announcement](http://www.google.com)|
|
15
|
+
|This|is|a|_table_|
|
16
|
+
|This|is|a|**table**|
|
17
17
|
|This|is|a|table|
|
18
18
|
|This|is|a|table|
|
19
19
|
|
data/spec/table2.html
CHANGED
@@ -9,46 +9,73 @@
|
|
9
9
|
<p>This <a href="http://www.google.com">announcement</a> would be followed by instructions.</p>
|
10
10
|
|
11
11
|
<table>
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
12
|
+
<tr>
|
13
|
+
<th>This <br/></th>
|
14
|
+
<th><p>is</p></th>
|
15
|
+
<th>a</th>
|
16
|
+
<th>table</th>
|
17
|
+
</tr>
|
18
|
+
<tr>
|
19
|
+
<td>This</td>
|
20
|
+
<td>is</td>
|
21
|
+
<td>a</td>
|
22
|
+
</tr>
|
23
|
+
<tr>
|
24
|
+
<td>This</td>
|
25
|
+
<td>is</td>
|
26
|
+
<td>a</td>
|
27
|
+
<td><a href="http://www.google.com">announcement</a></td>
|
28
|
+
</tr>
|
29
|
+
<tr>
|
30
|
+
<td>This</td>
|
31
|
+
<td>is</td>
|
32
|
+
<td>a</td>
|
33
|
+
<td><em>table</em></td>
|
34
|
+
</tr>
|
35
|
+
<tr>
|
36
|
+
<td>This</td>
|
37
|
+
<td>is</td>
|
38
|
+
<td>a</td>
|
39
|
+
<td><strong>table</strong></td>
|
40
|
+
</tr>
|
41
|
+
<tr>
|
42
|
+
<td>This</td>
|
43
|
+
<td>is</td>
|
44
|
+
<td>a</td>
|
45
|
+
<td>table <br/></td>
|
46
|
+
</tr>
|
47
|
+
<tr>
|
48
|
+
<td>This</td>
|
49
|
+
<td><p>is</p></td>
|
50
|
+
<td><p>a<br/></p><br/></td>
|
51
|
+
<td>table</td>
|
52
|
+
</tr>
|
53
|
+
</table>
|
54
|
+
|
55
|
+
<p>This paragraph is just to break up the tables</p>
|
56
|
+
|
57
|
+
<table>
|
58
|
+
<tr>
|
59
|
+
<th>This</th>
|
60
|
+
<th>is</th>
|
61
|
+
<th>a</th>
|
62
|
+
<th>second</th>
|
63
|
+
<th>table</th>
|
64
|
+
</tr>
|
65
|
+
<tr>
|
66
|
+
<td>This</td>
|
67
|
+
<td>is</td>
|
68
|
+
<td>a</td>
|
69
|
+
<th>second</th>
|
70
|
+
<th>table</th>
|
71
|
+
</tr>
|
72
|
+
<tr>
|
73
|
+
<td>This</td>
|
74
|
+
<td>is</td>
|
75
|
+
<td>a</td>
|
76
|
+
<td><a href="http://www.google.com">announcement</a></td>
|
77
|
+
<th>second</th>
|
78
|
+
</tr>
|
79
|
+
</table>
|
53
80
|
</body>
|
54
81
|
</html>
|
data/spec/table2.markdown
CHANGED
@@ -8,12 +8,19 @@ This is only a test
|
|
8
8
|
|
9
9
|
This [announcement](http://www.google.com) would be followed by instructions.
|
10
10
|
|
11
|
-
|This
|
11
|
+
|This|is|a|table|
|
12
12
|
|---|---|---|---|
|
13
13
|
|This|is|a|
|
14
|
-
|This|is|a|
|
15
|
-
|This|is|a|
|
16
|
-
|This|is|a
|
17
|
-
|This|is|a|table
|
18
|
-
|This|
|
14
|
+
|This|is|a|[announcement](http://www.google.com)|
|
15
|
+
|This|is|a|_table_|
|
16
|
+
|This|is|a|**table**|
|
17
|
+
|This|is|a|table|
|
18
|
+
|This|is|a|table|
|
19
|
+
|
20
|
+
This paragraph is just to break up the tables
|
21
|
+
|
22
|
+
|This|is|a|second|table|
|
23
|
+
|---|---|---|---|---|
|
24
|
+
|This|is|a|second|table|
|
25
|
+
|This|is|a|[announcement](http://www.google.com)|second|
|
19
26
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: markitdown
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -103,7 +103,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
103
103
|
version: '0'
|
104
104
|
segments:
|
105
105
|
- 0
|
106
|
-
hash:
|
106
|
+
hash: 3593533597031584594
|
107
107
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
108
108
|
none: false
|
109
109
|
requirements:
|
@@ -112,7 +112,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
112
112
|
version: '0'
|
113
113
|
segments:
|
114
114
|
- 0
|
115
|
-
hash:
|
115
|
+
hash: 3593533597031584594
|
116
116
|
requirements: []
|
117
117
|
rubyforge_project:
|
118
118
|
rubygems_version: 1.8.23
|