markitdown 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,7 +13,7 @@ module Markitdown
13
13
  # gsub(/\n{2,}/,"\n\n") - collapse any series of more than 2 new lines down to 2
14
14
  # gsub(/\t+/," ") - collapse consecutive tabs down to a single space. I use tabs to pad divs and spans, this causes multiple nested spans and divs to ultimately be surrounded by a single space.
15
15
  # gsub(/ ([\.\?])/,'\1') - removes a space before a period or question mark. Things like links get surrounded by spaces. If they appear at the end of a sentence, this makes sure the punctuation isn't off.
16
- self.parse_node(node, [], language_classifier).flatten.compact.join.gsub(/\n\s+\n/,"\n\n").gsub(/\n{2,}/,"\n\n").gsub(/( > \n){2,}/,"\n > \n > ").gsub(/\t+/," ").gsub(/ ([\.\?])/,'\1').gsub(/\s*END_TAG\((.{1,3})\)/, "\\1")
16
+ self.parse_node(node, [], language_classifier).flatten.compact.join.gsub(/\n\s+\n/,"\n\n").gsub(/\n{2,}/,"\n\n").gsub(/( > \n){2,}/,"\n > \n > ").gsub(/\t+/," ").gsub(/ ([\.\?])/,'\1').gsub(/\s*END_TAG\((.{1,3})\)/, "\\1").gsub(/\u00a0/, " ")
17
17
  end
18
18
 
19
19
  private
@@ -141,7 +141,7 @@ module Markitdown
141
141
  results << node.text.strip.gsub("\n","").gsub(/ {2,}/," ")
142
142
  when "code"
143
143
  if node.text.include?("\n")
144
- text = node.text.gsub(/^\n/,"").gsub(/\n\s*$/,"")
144
+ text = node.text.gsub(/^\n/,"").gsub(/\n\s*$/,"").gsub(/\u00a0/, " ")
145
145
  if language_classifier
146
146
  language = language_classifier.classify(text)
147
147
  results << "\n\n```#{language}\n#{text}\n```\n\n"
@@ -1,3 +1,3 @@
1
1
  module Markitdown
2
- VERSION = "0.3.0"
2
+ VERSION = "0.3.1"
3
3
  end
@@ -22,4 +22,13 @@ describe Markitdown do
22
22
  end
23
23
  end
24
24
  end
25
+
26
+ context "When parsing codeblocks that contain nbsp" do
27
+ let(:html) { File.read("spec/code_with_nbsp.html") }
28
+ let(:markdown) { File.read("spec/code_with_nbsp.markdown") }
29
+
30
+ it "should produce valid markdown" do
31
+ Markitdown.from_html(html).should == markdown
32
+ end
33
+ end
25
34
  end
@@ -2,25 +2,25 @@
2
2
 
3
3
  ```html
4
4
  <table>
5
-   <thead>
6
-     <tr>
7
-       <th>Column 1</th>
8
-       <th>Column 2</th>
9
-       <th>Column 3</th>
10
-     </tr>
11
-   </thead>
12
-   <tbody>
13
-     <tr>
14
-       <td>Value 1</td>
15
-       <td>Value 2</td>
16
-       <td>Value 3</td>
17
-     </tr>
18
-     <tr>
19
-       <td>Value 1a</td>
20
-       <td>Value 2a</td>
21
-       <td>Value 3a</td>
22
-     </tr>
23
-   </tbody>
5
+ <thead>
6
+ <tr>
7
+ <th>Column 1</th>
8
+ <th>Column 2</th>
9
+ <th>Column 3</th>
10
+ </tr>
11
+ </thead>
12
+ <tbody>
13
+ <tr>
14
+ <td>Value 1</td>
15
+ <td>Value 2</td>
16
+ <td>Value 3</td>
17
+ </tr>
18
+ <tr>
19
+ <td>Value 1a</td>
20
+ <td>Value 2a</td>
21
+ <td>Value 3a</td>
22
+ </tr>
23
+ </tbody>
24
24
  </table>
25
25
  ```
26
26
 
@@ -0,0 +1,38 @@
1
+ <?xml version="1.0"?>
2
+ <div>
3
+ <div>This is a&#xA0;test&#xA0;this&#xA0;is only a test<br/></div>
4
+ <div>
5
+ <br/>
6
+ </div>
7
+ <code>&lt;table&gt;
8
+ &#xA0; &lt;thead&gt;
9
+ &#xA0; &#xA0; &lt;tr&#xA0;class="row"&gt;
10
+ &#xA0; &#xA0; &#xA0; &lt;th class="header"&gt;Column 1&lt;/th&gt;
11
+ &#xA0; &#xA0; &#xA0; &lt;th&#xA0;class="header"&gt;Column 2&lt;/th&gt;
12
+ &#xA0; &#xA0; &#xA0; &lt;th&#xA0;class="header"&gt;Column 3&lt;/th&gt;
13
+ &#xA0; &#xA0; &lt;/tr&gt;
14
+ &#xA0; &lt;/thead&gt;
15
+ &#xA0; &lt;tbody&gt;
16
+
17
+ &#xA0; &#xA0; &lt;tr class="row"&gt;
18
+
19
+ &#xA0; &#xA0; &#xA0; &lt;td class="cell"&gt;Value 1&lt;/td&gt;
20
+ &#xA0; &#xA0; &#xA0; &lt;td&#xA0;class="cell"&gt;Value 2&lt;/td&gt;
21
+ &#xA0; &#xA0; &#xA0; &lt;td&#xA0;class="cell"&gt;Value 3&lt;/td&gt;
22
+ &#xA0; &#xA0; &lt;/tr&gt;
23
+
24
+ &#xA0; &#xA0; &lt;tr
25
+ &#xA0;class="row3"&gt;
26
+ &#xA0; &#xA0; &#xA0; &lt;td&#xA0;class="cell"&gt;Value 1a&lt;/td&gt;
27
+ &#xA0; &#xA0; &#xA0; &lt;td&#xA0;class="cell"&gt;Value 2a&lt;/td&gt;
28
+ &#xA0; &#xA0; &#xA0; &lt;td&#xA0;class="cell"&gt;Value 3a&lt;/td&gt;
29
+ &#xA0; &#xA0; &lt;/tr&gt;
30
+ &#xA0; &lt;/tbody&gt;
31
+ &lt;/table&gt;</code>
32
+ <div>
33
+ <br/>
34
+ </div>
35
+ <div>
36
+ <br/>
37
+ </div>
38
+ </div>
@@ -0,0 +1,28 @@
1
+ This is a test this is only a test
2
+
3
+ ```
4
+ <table>
5
+ <thead>
6
+ <tr class="row">
7
+ <th class="header">Column 1</th>
8
+ <th class="header">Column 2</th>
9
+ <th class="header">Column 3</th>
10
+ </tr>
11
+ </thead>
12
+ <tbody>
13
+ <tr class="row">
14
+ <td class="cell">Value 1</td>
15
+ <td class="cell">Value 2</td>
16
+ <td class="cell">Value 3</td>
17
+ </tr>
18
+ <tr
19
+ class="row3">
20
+ <td class="cell">Value 1a</td>
21
+ <td class="cell">Value 2a</td>
22
+ <td class="cell">Value 3a</td>
23
+ </tr>
24
+ </tbody>
25
+ </table>
26
+ ```
27
+
28
+
@@ -2,25 +2,25 @@
2
2
 
3
3
  ```
4
4
  <table>
5
-   <thead>
6
-     <tr>
7
-       <th>Column 1</th>
8
-       <th>Column 2</th>
9
-       <th>Column 3</th>
10
-     </tr>
11
-   </thead>
12
-   <tbody>
13
-     <tr>
14
-       <td>Value 1</td>
15
-       <td>Value 2</td>
16
-       <td>Value 3</td>
17
-     </tr>
18
-     <tr>
19
-       <td>Value 1a</td>
20
-       <td>Value 2a</td>
21
-       <td>Value 3a</td>
22
-     </tr>
23
-   </tbody>
5
+ <thead>
6
+ <tr>
7
+ <th>Column 1</th>
8
+ <th>Column 2</th>
9
+ <th>Column 3</th>
10
+ </tr>
11
+ </thead>
12
+ <tbody>
13
+ <tr>
14
+ <td>Value 1</td>
15
+ <td>Value 2</td>
16
+ <td>Value 3</td>
17
+ </tr>
18
+ <tr>
19
+ <td>Value 1a</td>
20
+ <td>Value 2a</td>
21
+ <td>Value 3a</td>
22
+ </tr>
23
+ </tbody>
24
24
  </table>
25
25
  ```
26
26
 
@@ -0,0 +1,37 @@
1
+ <div>
2
+ <div>This is a&#xA0;test&#xA0;this&#xA0;is only a test<br/></div>
3
+ <div>
4
+ <br/>
5
+ </div>
6
+ <code>&lt;table&gt;
7
+ &#xA0; &lt;thead&gt;
8
+ &#xA0; &#xA0; &lt;tr&#xA0;class="row"&gt;
9
+ &#xA0; &#xA0; &#xA0; &lt;th class="header"&gt;Column 1&lt;/th&gt;
10
+ &#xA0; &#xA0; &#xA0; &lt;th&#xA0;class="header"&gt;Column 2&lt;/th&gt;
11
+ &#xA0; &#xA0; &#xA0; &lt;th&#xA0;class="header"&gt;Column 3&lt;/th&gt;
12
+ &#xA0; &#xA0; &lt;/tr&gt;
13
+ &#xA0; &lt;/thead&gt;
14
+ &#xA0; &lt;tbody&gt;
15
+
16
+ &#xA0; &#xA0; &lt;tr class="row"&gt;
17
+
18
+ &#xA0; &#xA0; &#xA0; &lt;td class="cell"&gt;Value 1&lt;/td&gt;
19
+ &#xA0; &#xA0; &#xA0; &lt;td&#xA0;class="cell"&gt;Value 2&lt;/td&gt;
20
+ &#xA0; &#xA0; &#xA0; &lt;td&#xA0;class="cell"&gt;Value 3&lt;/td&gt;
21
+ &#xA0; &#xA0; &lt;/tr&gt;
22
+
23
+ &#xA0; &#xA0; &lt;tr
24
+ &#xA0;class="row3"&gt;
25
+ &#xA0; &#xA0; &#xA0; &lt;td&#xA0;class="cell"&gt;Value 1a&lt;/td&gt;
26
+ &#xA0; &#xA0; &#xA0; &lt;td&#xA0;class="cell"&gt;Value 2a&lt;/td&gt;
27
+ &#xA0; &#xA0; &#xA0; &lt;td&#xA0;class="cell"&gt;Value 3a&lt;/td&gt;
28
+ &#xA0; &#xA0; &lt;/tr&gt;
29
+ &#xA0; &lt;/tbody&gt;
30
+ &lt;/table&gt;</code>
31
+ <div>
32
+ <br/>
33
+ </div>
34
+ <div>
35
+ <br/>
36
+ </div>
37
+ </div>
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: markitdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-08-18 00:00:00.000000000 Z
12
+ date: 2013-08-23 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -96,7 +96,10 @@ files:
96
96
  - spec/code.html
97
97
  - spec/code_spec.rb
98
98
  - spec/code_with_language.markdown
99
+ - spec/code_with_nbsp.html
100
+ - spec/code_with_nbsp.markdown
99
101
  - spec/code_without_language.markdown
102
+ - spec/code_wth_nbsp.html
100
103
  - spec/doc.html
101
104
  - spec/doc.markdown
102
105
  - spec/doc_spec.rb
@@ -124,7 +127,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
124
127
  version: '0'
125
128
  segments:
126
129
  - 0
127
- hash: -1182280358588269194
130
+ hash: -237341598759063223
128
131
  required_rubygems_version: !ruby/object:Gem::Requirement
129
132
  none: false
130
133
  requirements:
@@ -133,7 +136,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
133
136
  version: '0'
134
137
  segments:
135
138
  - 0
136
- hash: -1182280358588269194
139
+ hash: -237341598759063223
137
140
  requirements: []
138
141
  rubyforge_project:
139
142
  rubygems_version: 1.8.23
@@ -146,7 +149,10 @@ test_files:
146
149
  - spec/code.html
147
150
  - spec/code_spec.rb
148
151
  - spec/code_with_language.markdown
152
+ - spec/code_with_nbsp.html
153
+ - spec/code_with_nbsp.markdown
149
154
  - spec/code_without_language.markdown
155
+ - spec/code_wth_nbsp.html
150
156
  - spec/doc.html
151
157
  - spec/doc.markdown
152
158
  - spec/doc_spec.rb