markitdown 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -136,17 +136,19 @@ module Markitdown
136
136
  after = "\n\n"
137
137
  when "th"
138
138
  results << "|"
139
+ strip_content = true
139
140
  flatten_content = true
140
141
  when "td"
141
142
  results << "|"
143
+ strip_content = true
142
144
  flatten_content = true
143
145
  when "tr"
144
146
  after = "|\n"
145
147
  table = find_parent(node.parent, "table")
146
148
  if table
147
- first_row = table.xpath("//tr").first
149
+ first_row = table.xpath(".//tr").first
148
150
  if first_row == node
149
- cell_count = node.xpath("//th|td").count
151
+ cell_count = node.xpath(".//th|td").count
150
152
  after << ("|---"*cell_count) + "|\n"
151
153
  end
152
154
  end
@@ -156,7 +158,7 @@ module Markitdown
156
158
  node.children.each do |child|
157
159
  contents = self.parse_node(child, states)
158
160
  contents = contents.flatten.compact.join.strip if strip_content
159
- contents = contents.flatten.compact.join.gsub("\n", " ") if flatten_content
161
+ contents = [contents].flatten.compact.join.gsub("\n", " ") if flatten_content
160
162
  results << contents
161
163
  end
162
164
  end
@@ -1,3 +1,3 @@
1
1
  module Markitdown
2
- VERSION = "0.1.1"
2
+ VERSION = "0.1.2"
3
3
  end
@@ -11,9 +11,9 @@ This [announcement](http://www.google.com) would be followed by instructions.
11
11
  |This|is|a|table|
12
12
  |---|---|---|---|
13
13
  |This|is|a|
14
- |This|is|a| [announcement](http://www.google.com) |
15
- |This|is|a| _table_ |
16
- |This|is|a| **table** |
14
+ |This|is|a|[announcement](http://www.google.com)|
15
+ |This|is|a|_table_|
16
+ |This|is|a|**table**|
17
17
  |This|is|a|table|
18
18
  |This|is|a|table|
19
19
 
@@ -9,46 +9,73 @@
9
9
  <p>This <a href="http://www.google.com">announcement</a> would be followed by instructions.</p>
10
10
 
11
11
  <table>
12
- <tr>
13
- <th>This <br/></th>
14
- <th><p>is</p></th>
15
- <th>a</th>
16
- <th>table</th>
17
- </tr>
18
- <tr>
19
- <td>This</td>
20
- <td>is</td>
21
- <td>a</td>
22
- </tr>
23
- <tr>
24
- <td>This</td>
25
- <td>is</td>
26
- <td>a</td>
27
- <td><a href="http://www.google.com">announcement</a></td>
28
- </tr>
29
- <tr>
30
- <td>This</td>
31
- <td>is</td>
32
- <td>a</td>
33
- <td><em>table</em></td>
34
- </tr>
35
- <tr>
36
- <td>This</td>
37
- <td>is</td>
38
- <td>a</td>
39
- <td><strong>table</strong></td>
40
- </tr>
41
- <tr>
42
- <td>This</td>
43
- <td>is</td>
44
- <td>a</td>
45
- <td>table <br/></td>
46
- </tr>
47
- <tr>
48
- <td>This</td>
49
- <td><p>is</p></td>
50
- <td><p>a<br/></p><br/></td>
51
- <td>table</td>
52
- </tr>
12
+ <tr>
13
+ <th>This <br/></th>
14
+ <th><p>is</p></th>
15
+ <th>a</th>
16
+ <th>table</th>
17
+ </tr>
18
+ <tr>
19
+ <td>This</td>
20
+ <td>is</td>
21
+ <td>a</td>
22
+ </tr>
23
+ <tr>
24
+ <td>This</td>
25
+ <td>is</td>
26
+ <td>a</td>
27
+ <td><a href="http://www.google.com">announcement</a></td>
28
+ </tr>
29
+ <tr>
30
+ <td>This</td>
31
+ <td>is</td>
32
+ <td>a</td>
33
+ <td><em>table</em></td>
34
+ </tr>
35
+ <tr>
36
+ <td>This</td>
37
+ <td>is</td>
38
+ <td>a</td>
39
+ <td><strong>table</strong></td>
40
+ </tr>
41
+ <tr>
42
+ <td>This</td>
43
+ <td>is</td>
44
+ <td>a</td>
45
+ <td>table <br/></td>
46
+ </tr>
47
+ <tr>
48
+ <td>This</td>
49
+ <td><p>is</p></td>
50
+ <td><p>a<br/></p><br/></td>
51
+ <td>table</td>
52
+ </tr>
53
+ </table>
54
+
55
+ <p>This paragraph is just to break up the tables</p>
56
+
57
+ <table>
58
+ <tr>
59
+ <th>This</th>
60
+ <th>is</th>
61
+ <th>a</th>
62
+ <th>second</th>
63
+ <th>table</th>
64
+ </tr>
65
+ <tr>
66
+ <td>This</td>
67
+ <td>is</td>
68
+ <td>a</td>
69
+ <th>second</th>
70
+ <th>table</th>
71
+ </tr>
72
+ <tr>
73
+ <td>This</td>
74
+ <td>is</td>
75
+ <td>a</td>
76
+ <td><a href="http://www.google.com">announcement</a></td>
77
+ <th>second</th>
78
+ </tr>
79
+ </table>
53
80
  </body>
54
81
  </html>
@@ -8,12 +8,19 @@ This is only a test
8
8
 
9
9
  This [announcement](http://www.google.com) would be followed by instructions.
10
10
 
11
- |This | is |a|table|
11
+ |This|is|a|table|
12
12
  |---|---|---|---|
13
13
  |This|is|a|
14
- |This|is|a| [announcement](http://www.google.com) |
15
- |This|is|a| _table_ |
16
- |This|is|a| **table** |
17
- |This|is|a|table |
18
- |This| is | a |table|
14
+ |This|is|a|[announcement](http://www.google.com)|
15
+ |This|is|a|_table_|
16
+ |This|is|a|**table**|
17
+ |This|is|a|table|
18
+ |This|is|a|table|
19
+
20
+ This paragraph is just to break up the tables
21
+
22
+ |This|is|a|second|table|
23
+ |---|---|---|---|---|
24
+ |This|is|a|second|table|
25
+ |This|is|a|[announcement](http://www.google.com)|second|
19
26
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: markitdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -103,7 +103,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
103
103
  version: '0'
104
104
  segments:
105
105
  - 0
106
- hash: -667411799113990729
106
+ hash: 3593533597031584594
107
107
  required_rubygems_version: !ruby/object:Gem::Requirement
108
108
  none: false
109
109
  requirements:
@@ -112,7 +112,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
112
112
  version: '0'
113
113
  segments:
114
114
  - 0
115
- hash: -667411799113990729
115
+ hash: 3593533597031584594
116
116
  requirements: []
117
117
  rubyforge_project:
118
118
  rubygems_version: 1.8.23