markitdown 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -13,7 +13,7 @@ module Markitdown
13
13
  # gsub(/\n{2,}/,"\n\n") - collapse any series of more than 2 new lines down to 2
14
14
  # gsub(/\t+/," ") - collapse consecutive tabs down to a single space. I use tabs to pad divs and span, this causes multiple nested spans and divs to ultimately be surrounded by a single space.
15
15
  # gsub(/ ([\.\?])/,'\1') - removes a space before a period or question mark. Things like links get surrounded by spaces. If they appear at the end of a sentence, this makes sure the punctuation isn't off.
16
- self.parse_node(node, [], language_classifier).flatten.compact.join.gsub(/\n\s+\n/,"\n\n").gsub(/\n{2,}/,"\n\n").gsub(/( > \n){2,}/,"\n > \n > ").gsub(/\t+/," ").gsub(/ ([\.\?])/,'\1').gsub(/\s*END_TAG\((.{1,3})\)/, "\\1")
16
+ self.parse_node(node, [], language_classifier).flatten.compact.join.gsub(/\n\s+\n/,"\n\n").gsub(/\n{2,}/,"\n\n").gsub(/( > \n){2,}/,"\n > \n > ").gsub(/\t+/," ").gsub(/ ([\.\?])/,'\1').gsub(/\s*END_TAG\((.{1,3})\)/, "\\1").gsub(/\u00a0/, " ")
17
17
  end
18
18
 
19
19
  private
@@ -141,7 +141,7 @@ module Markitdown
141
141
  results << node.text.strip.gsub("\n","").gsub(/ {2,}/," ")
142
142
  when "code"
143
143
  if node.text.include?("\n")
144
- text = node.text.gsub(/^\n/,"").gsub(/\n\s*$/,"")
144
+ text = node.text.gsub(/^\n/,"").gsub(/\n\s*$/,"").gsub(/\u00a0/, " ")
145
145
  if language_classifier
146
146
  language = language_classifier.classify(text)
147
147
  results << "\n\n```#{language}\n#{text}\n```\n\n"
@@ -1,3 +1,3 @@
1
1
  module Markitdown
2
- VERSION = "0.3.0"
2
+ VERSION = "0.3.1"
3
3
  end
@@ -22,4 +22,13 @@ describe Markitdown do
22
22
  end
23
23
  end
24
24
  end
25
+
26
+ context "When parsing codeblocks that contain nbsp" do
27
+ let(:html) { File.read("spec/code_with_nbsp.html") }
28
+ let(:markdown) { File.read("spec/code_with_nbsp.markdown") }
29
+
30
+ it "should produce valid markdown" do
31
+ Markitdown.from_html(html).should == markdown
32
+ end
33
+ end
25
34
  end
@@ -2,25 +2,25 @@
2
2
 
3
3
  ```html
4
4
  <table>
5
-   <thead>
6
-     <tr>
7
-       <th>Column 1</th>
8
-       <th>Column 2</th>
9
-       <th>Column 3</th>
10
-     </tr>
11
-   </thead>
12
-   <tbody>
13
-     <tr>
14
-       <td>Value 1</td>
15
-       <td>Value 2</td>
16
-       <td>Value 3</td>
17
-     </tr>
18
-     <tr>
19
-       <td>Value 1a</td>
20
-       <td>Value 2a</td>
21
-       <td>Value 3a</td>
22
-     </tr>
23
-   </tbody>
5
+ <thead>
6
+ <tr>
7
+ <th>Column 1</th>
8
+ <th>Column 2</th>
9
+ <th>Column 3</th>
10
+ </tr>
11
+ </thead>
12
+ <tbody>
13
+ <tr>
14
+ <td>Value 1</td>
15
+ <td>Value 2</td>
16
+ <td>Value 3</td>
17
+ </tr>
18
+ <tr>
19
+ <td>Value 1a</td>
20
+ <td>Value 2a</td>
21
+ <td>Value 3a</td>
22
+ </tr>
23
+ </tbody>
24
24
  </table>
25
25
  ```
26
26
 
@@ -0,0 +1,38 @@
1
+ <?xml version="1.0"?>
2
+ <div>
3
+ <div>This is a&#xA0;test&#xA0;this&#xA0;is only a test<br/></div>
4
+ <div>
5
+ <br/>
6
+ </div>
7
+ <code>&lt;table&gt;
8
+ &#xA0; &lt;thead&gt;
9
+ &#xA0; &#xA0; &lt;tr&#xA0;class="row"&gt;
10
+ &#xA0; &#xA0; &#xA0; &lt;th class="header"&gt;Column 1&lt;/th&gt;
11
+ &#xA0; &#xA0; &#xA0; &lt;th&#xA0;class="header"&gt;Column 2&lt;/th&gt;
12
+ &#xA0; &#xA0; &#xA0; &lt;th&#xA0;class="header"&gt;Column 3&lt;/th&gt;
13
+ &#xA0; &#xA0; &lt;/tr&gt;
14
+ &#xA0; &lt;/thead&gt;
15
+ &#xA0; &lt;tbody&gt;
16
+
17
+ &#xA0; &#xA0; &lt;tr class="row"&gt;
18
+
19
+ &#xA0; &#xA0; &#xA0; &lt;td class="cell"&gt;Value 1&lt;/td&gt;
20
+ &#xA0; &#xA0; &#xA0; &lt;td&#xA0;class="cell"&gt;Value 2&lt;/td&gt;
21
+ &#xA0; &#xA0; &#xA0; &lt;td&#xA0;class="cell"&gt;Value 3&lt;/td&gt;
22
+ &#xA0; &#xA0; &lt;/tr&gt;
23
+
24
+ &#xA0; &#xA0; &lt;tr
25
+ &#xA0;class="row3"&gt;
26
+ &#xA0; &#xA0; &#xA0; &lt;td&#xA0;class="cell"&gt;Value 1a&lt;/td&gt;
27
+ &#xA0; &#xA0; &#xA0; &lt;td&#xA0;class="cell"&gt;Value 2a&lt;/td&gt;
28
+ &#xA0; &#xA0; &#xA0; &lt;td&#xA0;class="cell"&gt;Value 3a&lt;/td&gt;
29
+ &#xA0; &#xA0; &lt;/tr&gt;
30
+ &#xA0; &lt;/tbody&gt;
31
+ &lt;/table&gt;</code>
32
+ <div>
33
+ <br/>
34
+ </div>
35
+ <div>
36
+ <br/>
37
+ </div>
38
+ </div>
@@ -0,0 +1,28 @@
1
+ This is a test this is only a test
2
+
3
+ ```
4
+ <table>
5
+ <thead>
6
+ <tr class="row">
7
+ <th class="header">Column 1</th>
8
+ <th class="header">Column 2</th>
9
+ <th class="header">Column 3</th>
10
+ </tr>
11
+ </thead>
12
+ <tbody>
13
+ <tr class="row">
14
+ <td class="cell">Value 1</td>
15
+ <td class="cell">Value 2</td>
16
+ <td class="cell">Value 3</td>
17
+ </tr>
18
+ <tr
19
+ class="row3">
20
+ <td class="cell">Value 1a</td>
21
+ <td class="cell">Value 2a</td>
22
+ <td class="cell">Value 3a</td>
23
+ </tr>
24
+ </tbody>
25
+ </table>
26
+ ```
27
+
28
+
@@ -2,25 +2,25 @@
2
2
 
3
3
  ```
4
4
  <table>
5
-   <thead>
6
-     <tr>
7
-       <th>Column 1</th>
8
-       <th>Column 2</th>
9
-       <th>Column 3</th>
10
-     </tr>
11
-   </thead>
12
-   <tbody>
13
-     <tr>
14
-       <td>Value 1</td>
15
-       <td>Value 2</td>
16
-       <td>Value 3</td>
17
-     </tr>
18
-     <tr>
19
-       <td>Value 1a</td>
20
-       <td>Value 2a</td>
21
-       <td>Value 3a</td>
22
-     </tr>
23
-   </tbody>
5
+ <thead>
6
+ <tr>
7
+ <th>Column 1</th>
8
+ <th>Column 2</th>
9
+ <th>Column 3</th>
10
+ </tr>
11
+ </thead>
12
+ <tbody>
13
+ <tr>
14
+ <td>Value 1</td>
15
+ <td>Value 2</td>
16
+ <td>Value 3</td>
17
+ </tr>
18
+ <tr>
19
+ <td>Value 1a</td>
20
+ <td>Value 2a</td>
21
+ <td>Value 3a</td>
22
+ </tr>
23
+ </tbody>
24
24
  </table>
25
25
  ```
26
26
 
@@ -0,0 +1,37 @@
1
+ <div>
2
+ <div>This is a&#xA0;test&#xA0;this&#xA0;is only a test<br/></div>
3
+ <div>
4
+ <br/>
5
+ </div>
6
+ <code>&lt;table&gt;
7
+ &#xA0; &lt;thead&gt;
8
+ &#xA0; &#xA0; &lt;tr&#xA0;class="row"&gt;
9
+ &#xA0; &#xA0; &#xA0; &lt;th class="header"&gt;Column 1&lt;/th&gt;
10
+ &#xA0; &#xA0; &#xA0; &lt;th&#xA0;class="header"&gt;Column 2&lt;/th&gt;
11
+ &#xA0; &#xA0; &#xA0; &lt;th&#xA0;class="header"&gt;Column 3&lt;/th&gt;
12
+ &#xA0; &#xA0; &lt;/tr&gt;
13
+ &#xA0; &lt;/thead&gt;
14
+ &#xA0; &lt;tbody&gt;
15
+
16
+ &#xA0; &#xA0; &lt;tr class="row"&gt;
17
+
18
+ &#xA0; &#xA0; &#xA0; &lt;td class="cell"&gt;Value 1&lt;/td&gt;
19
+ &#xA0; &#xA0; &#xA0; &lt;td&#xA0;class="cell"&gt;Value 2&lt;/td&gt;
20
+ &#xA0; &#xA0; &#xA0; &lt;td&#xA0;class="cell"&gt;Value 3&lt;/td&gt;
21
+ &#xA0; &#xA0; &lt;/tr&gt;
22
+
23
+ &#xA0; &#xA0; &lt;tr
24
+ &#xA0;class="row3"&gt;
25
+ &#xA0; &#xA0; &#xA0; &lt;td&#xA0;class="cell"&gt;Value 1a&lt;/td&gt;
26
+ &#xA0; &#xA0; &#xA0; &lt;td&#xA0;class="cell"&gt;Value 2a&lt;/td&gt;
27
+ &#xA0; &#xA0; &#xA0; &lt;td&#xA0;class="cell"&gt;Value 3a&lt;/td&gt;
28
+ &#xA0; &#xA0; &lt;/tr&gt;
29
+ &#xA0; &lt;/tbody&gt;
30
+ &lt;/table&gt;</code>
31
+ <div>
32
+ <br/>
33
+ </div>
34
+ <div>
35
+ <br/>
36
+ </div>
37
+ </div>
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: markitdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-08-18 00:00:00.000000000 Z
12
+ date: 2013-08-23 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -96,7 +96,10 @@ files:
96
96
  - spec/code.html
97
97
  - spec/code_spec.rb
98
98
  - spec/code_with_language.markdown
99
+ - spec/code_with_nbsp.html
100
+ - spec/code_with_nbsp.markdown
99
101
  - spec/code_without_language.markdown
102
+ - spec/code_wth_nbsp.html
100
103
  - spec/doc.html
101
104
  - spec/doc.markdown
102
105
  - spec/doc_spec.rb
@@ -124,7 +127,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
124
127
  version: '0'
125
128
  segments:
126
129
  - 0
127
- hash: -1182280358588269194
130
+ hash: -237341598759063223
128
131
  required_rubygems_version: !ruby/object:Gem::Requirement
129
132
  none: false
130
133
  requirements:
@@ -133,7 +136,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
133
136
  version: '0'
134
137
  segments:
135
138
  - 0
136
- hash: -1182280358588269194
139
+ hash: -237341598759063223
137
140
  requirements: []
138
141
  rubyforge_project:
139
142
  rubygems_version: 1.8.23
@@ -146,7 +149,10 @@ test_files:
146
149
  - spec/code.html
147
150
  - spec/code_spec.rb
148
151
  - spec/code_with_language.markdown
152
+ - spec/code_with_nbsp.html
153
+ - spec/code_with_nbsp.markdown
149
154
  - spec/code_without_language.markdown
155
+ - spec/code_wth_nbsp.html
150
156
  - spec/doc.html
151
157
  - spec/doc.markdown
152
158
  - spec/doc_spec.rb