markitdown 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/markitdown.rb +2 -2
- data/lib/markitdown/version.rb +1 -1
- data/spec/code_spec.rb +9 -0
- data/spec/code_with_language.markdown +19 -19
- data/spec/code_with_nbsp.html +38 -0
- data/spec/code_with_nbsp.markdown +28 -0
- data/spec/code_without_language.markdown +19 -19
- data/spec/code_wth_nbsp.html +37 -0
- metadata +10 -4
data/lib/markitdown.rb
CHANGED
@@ -13,7 +13,7 @@ module Markitdown
|
|
13
13
|
# gsub(/\n{2,}/,"\n\n") - collapse any series of more than 2 new lines down to 2
|
14
14
|
# gsub(/\t+/," ") - collapse consecutive tabs down to a single space. I use tabs to pad divs and span, this causes multiple nested spans and divs to ultimately be surrounded by a single space.
|
15
15
|
# gsub(/ ([\.\?])/,'\1') - removes a space before a period or question mark. Things like links get surrounded by spaces. If they appear at the end of a sentence, this makes sure the punctuation isn't off.
|
16
|
-
self.parse_node(node, [], language_classifier).flatten.compact.join.gsub(/\n\s+\n/,"\n\n").gsub(/\n{2,}/,"\n\n").gsub(/( > \n){2,}/,"\n > \n > ").gsub(/\t+/," ").gsub(/ ([\.\?])/,'\1').gsub(/\s*END_TAG\((.{1,3})\)/, "\\1")
|
16
|
+
self.parse_node(node, [], language_classifier).flatten.compact.join.gsub(/\n\s+\n/,"\n\n").gsub(/\n{2,}/,"\n\n").gsub(/( > \n){2,}/,"\n > \n > ").gsub(/\t+/," ").gsub(/ ([\.\?])/,'\1').gsub(/\s*END_TAG\((.{1,3})\)/, "\\1").gsub(/\u00a0/, " ")
|
17
17
|
end
|
18
18
|
|
19
19
|
private
|
@@ -141,7 +141,7 @@ module Markitdown
|
|
141
141
|
results << node.text.strip.gsub("\n","").gsub(/ {2,}/," ")
|
142
142
|
when "code"
|
143
143
|
if node.text.include?("\n")
|
144
|
-
text = node.text.gsub(/^\n/,"").gsub(/\n\s*$/,"")
|
144
|
+
text = node.text.gsub(/^\n/,"").gsub(/\n\s*$/,"").gsub(/\u00a0/, " ")
|
145
145
|
if language_classifier
|
146
146
|
language = language_classifier.classify(text)
|
147
147
|
results << "\n\n```#{language}\n#{text}\n```\n\n"
|
data/lib/markitdown/version.rb
CHANGED
data/spec/code_spec.rb
CHANGED
@@ -22,4 +22,13 @@ describe Markitdown do
|
|
22
22
|
end
|
23
23
|
end
|
24
24
|
end
|
25
|
+
|
26
|
+
context "When parsing codeblocks that contain nbsp" do
|
27
|
+
let(:html) { File.read("spec/code_with_nbsp.html") }
|
28
|
+
let(:markdown) { File.read("spec/code_with_nbsp.markdown") }
|
29
|
+
|
30
|
+
it "should produce valid markdown" do
|
31
|
+
Markitdown.from_html(html).should == markdown
|
32
|
+
end
|
33
|
+
end
|
25
34
|
end
|
@@ -2,25 +2,25 @@
|
|
2
2
|
|
3
3
|
```html
|
4
4
|
<table>
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
5
|
+
<thead>
|
6
|
+
<tr>
|
7
|
+
<th>Column 1</th>
|
8
|
+
<th>Column 2</th>
|
9
|
+
<th>Column 3</th>
|
10
|
+
</tr>
|
11
|
+
</thead>
|
12
|
+
<tbody>
|
13
|
+
<tr>
|
14
|
+
<td>Value 1</td>
|
15
|
+
<td>Value 2</td>
|
16
|
+
<td>Value 3</td>
|
17
|
+
</tr>
|
18
|
+
<tr>
|
19
|
+
<td>Value 1a</td>
|
20
|
+
<td>Value 2a</td>
|
21
|
+
<td>Value 3a</td>
|
22
|
+
</tr>
|
23
|
+
</tbody>
|
24
24
|
</table>
|
25
25
|
```
|
26
26
|
|
@@ -0,0 +1,38 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<div>
|
3
|
+
<div>This is a test this is only a test<br/></div>
|
4
|
+
<div>
|
5
|
+
<br/>
|
6
|
+
</div>
|
7
|
+
<code><table>
|
8
|
+
  <thead>
|
9
|
+
    <tr class="row">
|
10
|
+
      <th class="header">Column 1</th>
|
11
|
+
      <th class="header">Column 2</th>
|
12
|
+
      <th class="header">Column 3</th>
|
13
|
+
    </tr>
|
14
|
+
  </thead>
|
15
|
+
  <tbody>
|
16
|
+
|
17
|
+
    <tr class="row">
|
18
|
+
|
19
|
+
      <td class="cell">Value 1</td>
|
20
|
+
      <td class="cell">Value 2</td>
|
21
|
+
      <td class="cell">Value 3</td>
|
22
|
+
    </tr>
|
23
|
+
|
24
|
+
    <tr
|
25
|
+
 class="row3">
|
26
|
+
      <td class="cell">Value 1a</td>
|
27
|
+
      <td class="cell">Value 2a</td>
|
28
|
+
      <td class="cell">Value 3a</td>
|
29
|
+
    </tr>
|
30
|
+
  </tbody>
|
31
|
+
</table></code>
|
32
|
+
<div>
|
33
|
+
<br/>
|
34
|
+
</div>
|
35
|
+
<div>
|
36
|
+
<br/>
|
37
|
+
</div>
|
38
|
+
</div>
|
@@ -0,0 +1,28 @@
|
|
1
|
+
This is a test this is only a test
|
2
|
+
|
3
|
+
```
|
4
|
+
<table>
|
5
|
+
<thead>
|
6
|
+
<tr class="row">
|
7
|
+
<th class="header">Column 1</th>
|
8
|
+
<th class="header">Column 2</th>
|
9
|
+
<th class="header">Column 3</th>
|
10
|
+
</tr>
|
11
|
+
</thead>
|
12
|
+
<tbody>
|
13
|
+
<tr class="row">
|
14
|
+
<td class="cell">Value 1</td>
|
15
|
+
<td class="cell">Value 2</td>
|
16
|
+
<td class="cell">Value 3</td>
|
17
|
+
</tr>
|
18
|
+
<tr
|
19
|
+
class="row3">
|
20
|
+
<td class="cell">Value 1a</td>
|
21
|
+
<td class="cell">Value 2a</td>
|
22
|
+
<td class="cell">Value 3a</td>
|
23
|
+
</tr>
|
24
|
+
</tbody>
|
25
|
+
</table>
|
26
|
+
```
|
27
|
+
|
28
|
+
|
@@ -2,25 +2,25 @@
|
|
2
2
|
|
3
3
|
```
|
4
4
|
<table>
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
5
|
+
<thead>
|
6
|
+
<tr>
|
7
|
+
<th>Column 1</th>
|
8
|
+
<th>Column 2</th>
|
9
|
+
<th>Column 3</th>
|
10
|
+
</tr>
|
11
|
+
</thead>
|
12
|
+
<tbody>
|
13
|
+
<tr>
|
14
|
+
<td>Value 1</td>
|
15
|
+
<td>Value 2</td>
|
16
|
+
<td>Value 3</td>
|
17
|
+
</tr>
|
18
|
+
<tr>
|
19
|
+
<td>Value 1a</td>
|
20
|
+
<td>Value 2a</td>
|
21
|
+
<td>Value 3a</td>
|
22
|
+
</tr>
|
23
|
+
</tbody>
|
24
24
|
</table>
|
25
25
|
```
|
26
26
|
|
@@ -0,0 +1,37 @@
|
|
1
|
+
<div>
|
2
|
+
<div>This is a test this is only a test<br/></div>
|
3
|
+
<div>
|
4
|
+
<br/>
|
5
|
+
</div>
|
6
|
+
<code><table>
|
7
|
+
  <thead>
|
8
|
+
    <tr class="row">
|
9
|
+
      <th class="header">Column 1</th>
|
10
|
+
      <th class="header">Column 2</th>
|
11
|
+
      <th class="header">Column 3</th>
|
12
|
+
    </tr>
|
13
|
+
  </thead>
|
14
|
+
  <tbody>
|
15
|
+
|
16
|
+
    <tr class="row">
|
17
|
+
|
18
|
+
      <td class="cell">Value 1</td>
|
19
|
+
      <td class="cell">Value 2</td>
|
20
|
+
      <td class="cell">Value 3</td>
|
21
|
+
    </tr>
|
22
|
+
|
23
|
+
    <tr
|
24
|
+
 class="row3">
|
25
|
+
      <td class="cell">Value 1a</td>
|
26
|
+
      <td class="cell">Value 2a</td>
|
27
|
+
      <td class="cell">Value 3a</td>
|
28
|
+
    </tr>
|
29
|
+
  </tbody>
|
30
|
+
</table></code>
|
31
|
+
<div>
|
32
|
+
<br/>
|
33
|
+
</div>
|
34
|
+
<div>
|
35
|
+
<br/>
|
36
|
+
</div>
|
37
|
+
</div>
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: markitdown
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.3.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-08-
|
12
|
+
date: 2013-08-23 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -96,7 +96,10 @@ files:
|
|
96
96
|
- spec/code.html
|
97
97
|
- spec/code_spec.rb
|
98
98
|
- spec/code_with_language.markdown
|
99
|
+
- spec/code_with_nbsp.html
|
100
|
+
- spec/code_with_nbsp.markdown
|
99
101
|
- spec/code_without_language.markdown
|
102
|
+
- spec/code_wth_nbsp.html
|
100
103
|
- spec/doc.html
|
101
104
|
- spec/doc.markdown
|
102
105
|
- spec/doc_spec.rb
|
@@ -124,7 +127,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
124
127
|
version: '0'
|
125
128
|
segments:
|
126
129
|
- 0
|
127
|
-
hash: -
|
130
|
+
hash: -237341598759063223
|
128
131
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
129
132
|
none: false
|
130
133
|
requirements:
|
@@ -133,7 +136,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
133
136
|
version: '0'
|
134
137
|
segments:
|
135
138
|
- 0
|
136
|
-
hash: -
|
139
|
+
hash: -237341598759063223
|
137
140
|
requirements: []
|
138
141
|
rubyforge_project:
|
139
142
|
rubygems_version: 1.8.23
|
@@ -146,7 +149,10 @@ test_files:
|
|
146
149
|
- spec/code.html
|
147
150
|
- spec/code_spec.rb
|
148
151
|
- spec/code_with_language.markdown
|
152
|
+
- spec/code_with_nbsp.html
|
153
|
+
- spec/code_with_nbsp.markdown
|
149
154
|
- spec/code_without_language.markdown
|
155
|
+
- spec/code_wth_nbsp.html
|
150
156
|
- spec/doc.html
|
151
157
|
- spec/doc.markdown
|
152
158
|
- spec/doc_spec.rb
|