markitdown 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/markitdown.rb +2 -2
- data/lib/markitdown/version.rb +1 -1
- data/spec/code_spec.rb +9 -0
- data/spec/code_with_language.markdown +19 -19
- data/spec/code_with_nbsp.html +38 -0
- data/spec/code_with_nbsp.markdown +28 -0
- data/spec/code_without_language.markdown +19 -19
- data/spec/code_wth_nbsp.html +37 -0
- metadata +10 -4
data/lib/markitdown.rb
CHANGED
@@ -13,7 +13,7 @@ module Markitdown
|
|
13
13
|
# gsub(/\n{2,}/,"\n\n") - collapse any series of 2 or more new lines down to exactly 2
|
14
14
|
# gsub(/\t+/," ") - collapse consecutive tabs down to a single space. I use tabs to pad divs and spans; this causes multiple nested spans and divs to ultimately be surrounded by a single space.
|
15
15
|
# gsub(/ ([\.\?])/,'\1') - removes a space before a period or question mark. Things like links get surrounded by spaces. If they appear at the end of a sentence, this makes sure the punctuation isn't off.
|
16
|
-
self.parse_node(node, [], language_classifier).flatten.compact.join.gsub(/\n\s+\n/,"\n\n").gsub(/\n{2,}/,"\n\n").gsub(/( > \n){2,}/,"\n > \n > ").gsub(/\t+/," ").gsub(/ ([\.\?])/,'\1').gsub(/\s*END_TAG\((.{1,3})\)/, "\\1")
|
16
|
+
self.parse_node(node, [], language_classifier).flatten.compact.join.gsub(/\n\s+\n/,"\n\n").gsub(/\n{2,}/,"\n\n").gsub(/( > \n){2,}/,"\n > \n > ").gsub(/\t+/," ").gsub(/ ([\.\?])/,'\1').gsub(/\s*END_TAG\((.{1,3})\)/, "\\1").gsub(/\u00a0/, " ")
|
17
17
|
end
|
18
18
|
|
19
19
|
private
|
@@ -141,7 +141,7 @@ module Markitdown
|
|
141
141
|
results << node.text.strip.gsub("\n","").gsub(/ {2,}/," ")
|
142
142
|
when "code"
|
143
143
|
if node.text.include?("\n")
|
144
|
-
text = node.text.gsub(/^\n/,"").gsub(/\n\s*$/,"")
|
144
|
+
text = node.text.gsub(/^\n/,"").gsub(/\n\s*$/,"").gsub(/\u00a0/, " ")
|
145
145
|
if language_classifier
|
146
146
|
language = language_classifier.classify(text)
|
147
147
|
results << "\n\n```#{language}\n#{text}\n```\n\n"
|
data/lib/markitdown/version.rb
CHANGED
data/spec/code_spec.rb
CHANGED
@@ -22,4 +22,13 @@ describe Markitdown do
|
|
22
22
|
end
|
23
23
|
end
|
24
24
|
end
|
25
|
+
|
26
|
+
context "When parsing codeblocks that contain nbsp" do
|
27
|
+
let(:html) { File.read("spec/code_with_nbsp.html") }
|
28
|
+
let(:markdown) { File.read("spec/code_with_nbsp.markdown") }
|
29
|
+
|
30
|
+
it "should produce valid markdown" do
|
31
|
+
Markitdown.from_html(html).should == markdown
|
32
|
+
end
|
33
|
+
end
|
25
34
|
end
|
@@ -2,25 +2,25 @@
|
|
2
2
|
|
3
3
|
```html
|
4
4
|
<table>
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
5
|
+
<thead>
|
6
|
+
<tr>
|
7
|
+
<th>Column 1</th>
|
8
|
+
<th>Column 2</th>
|
9
|
+
<th>Column 3</th>
|
10
|
+
</tr>
|
11
|
+
</thead>
|
12
|
+
<tbody>
|
13
|
+
<tr>
|
14
|
+
<td>Value 1</td>
|
15
|
+
<td>Value 2</td>
|
16
|
+
<td>Value 3</td>
|
17
|
+
</tr>
|
18
|
+
<tr>
|
19
|
+
<td>Value 1a</td>
|
20
|
+
<td>Value 2a</td>
|
21
|
+
<td>Value 3a</td>
|
22
|
+
</tr>
|
23
|
+
</tbody>
|
24
24
|
</table>
|
25
25
|
```
|
26
26
|
|
@@ -0,0 +1,38 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<div>
|
3
|
+
<div>This is a test this is only a test<br/></div>
|
4
|
+
<div>
|
5
|
+
<br/>
|
6
|
+
</div>
|
7
|
+
<code><table>
|
8
|
+
  <thead>
|
9
|
+
    <tr class="row">
|
10
|
+
      <th class="header">Column 1</th>
|
11
|
+
      <th class="header">Column 2</th>
|
12
|
+
      <th class="header">Column 3</th>
|
13
|
+
    </tr>
|
14
|
+
  </thead>
|
15
|
+
  <tbody>
|
16
|
+
|
17
|
+
    <tr class="row">
|
18
|
+
|
19
|
+
      <td class="cell">Value 1</td>
|
20
|
+
      <td class="cell">Value 2</td>
|
21
|
+
      <td class="cell">Value 3</td>
|
22
|
+
    </tr>
|
23
|
+
|
24
|
+
    <tr
|
25
|
+
 class="row3">
|
26
|
+
      <td class="cell">Value 1a</td>
|
27
|
+
      <td class="cell">Value 2a</td>
|
28
|
+
      <td class="cell">Value 3a</td>
|
29
|
+
    </tr>
|
30
|
+
  </tbody>
|
31
|
+
</table></code>
|
32
|
+
<div>
|
33
|
+
<br/>
|
34
|
+
</div>
|
35
|
+
<div>
|
36
|
+
<br/>
|
37
|
+
</div>
|
38
|
+
</div>
|
@@ -0,0 +1,28 @@
|
|
1
|
+
This is a test this is only a test
|
2
|
+
|
3
|
+
```
|
4
|
+
<table>
|
5
|
+
<thead>
|
6
|
+
<tr class="row">
|
7
|
+
<th class="header">Column 1</th>
|
8
|
+
<th class="header">Column 2</th>
|
9
|
+
<th class="header">Column 3</th>
|
10
|
+
</tr>
|
11
|
+
</thead>
|
12
|
+
<tbody>
|
13
|
+
<tr class="row">
|
14
|
+
<td class="cell">Value 1</td>
|
15
|
+
<td class="cell">Value 2</td>
|
16
|
+
<td class="cell">Value 3</td>
|
17
|
+
</tr>
|
18
|
+
<tr
|
19
|
+
class="row3">
|
20
|
+
<td class="cell">Value 1a</td>
|
21
|
+
<td class="cell">Value 2a</td>
|
22
|
+
<td class="cell">Value 3a</td>
|
23
|
+
</tr>
|
24
|
+
</tbody>
|
25
|
+
</table>
|
26
|
+
```
|
27
|
+
|
28
|
+
|
@@ -2,25 +2,25 @@
|
|
2
2
|
|
3
3
|
```
|
4
4
|
<table>
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
5
|
+
<thead>
|
6
|
+
<tr>
|
7
|
+
<th>Column 1</th>
|
8
|
+
<th>Column 2</th>
|
9
|
+
<th>Column 3</th>
|
10
|
+
</tr>
|
11
|
+
</thead>
|
12
|
+
<tbody>
|
13
|
+
<tr>
|
14
|
+
<td>Value 1</td>
|
15
|
+
<td>Value 2</td>
|
16
|
+
<td>Value 3</td>
|
17
|
+
</tr>
|
18
|
+
<tr>
|
19
|
+
<td>Value 1a</td>
|
20
|
+
<td>Value 2a</td>
|
21
|
+
<td>Value 3a</td>
|
22
|
+
</tr>
|
23
|
+
</tbody>
|
24
24
|
</table>
|
25
25
|
```
|
26
26
|
|
@@ -0,0 +1,37 @@
|
|
1
|
+
<div>
|
2
|
+
<div>This is a test this is only a test<br/></div>
|
3
|
+
<div>
|
4
|
+
<br/>
|
5
|
+
</div>
|
6
|
+
<code><table>
|
7
|
+
  <thead>
|
8
|
+
    <tr class="row">
|
9
|
+
      <th class="header">Column 1</th>
|
10
|
+
      <th class="header">Column 2</th>
|
11
|
+
      <th class="header">Column 3</th>
|
12
|
+
    </tr>
|
13
|
+
  </thead>
|
14
|
+
  <tbody>
|
15
|
+
|
16
|
+
    <tr class="row">
|
17
|
+
|
18
|
+
      <td class="cell">Value 1</td>
|
19
|
+
      <td class="cell">Value 2</td>
|
20
|
+
      <td class="cell">Value 3</td>
|
21
|
+
    </tr>
|
22
|
+
|
23
|
+
    <tr
|
24
|
+
 class="row3">
|
25
|
+
      <td class="cell">Value 1a</td>
|
26
|
+
      <td class="cell">Value 2a</td>
|
27
|
+
      <td class="cell">Value 3a</td>
|
28
|
+
    </tr>
|
29
|
+
  </tbody>
|
30
|
+
</table></code>
|
31
|
+
<div>
|
32
|
+
<br/>
|
33
|
+
</div>
|
34
|
+
<div>
|
35
|
+
<br/>
|
36
|
+
</div>
|
37
|
+
</div>
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: markitdown
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-08-
|
12
|
+
date: 2013-08-23 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -96,7 +96,10 @@ files:
|
|
96
96
|
- spec/code.html
|
97
97
|
- spec/code_spec.rb
|
98
98
|
- spec/code_with_language.markdown
|
99
|
+
- spec/code_with_nbsp.html
|
100
|
+
- spec/code_with_nbsp.markdown
|
99
101
|
- spec/code_without_language.markdown
|
102
|
+
- spec/code_wth_nbsp.html
|
100
103
|
- spec/doc.html
|
101
104
|
- spec/doc.markdown
|
102
105
|
- spec/doc_spec.rb
|
@@ -124,7 +127,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
124
127
|
version: '0'
|
125
128
|
segments:
|
126
129
|
- 0
|
127
|
-
hash: -
|
130
|
+
hash: -237341598759063223
|
128
131
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
129
132
|
none: false
|
130
133
|
requirements:
|
@@ -133,7 +136,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
133
136
|
version: '0'
|
134
137
|
segments:
|
135
138
|
- 0
|
136
|
-
hash: -
|
139
|
+
hash: -237341598759063223
|
137
140
|
requirements: []
|
138
141
|
rubyforge_project:
|
139
142
|
rubygems_version: 1.8.23
|
@@ -146,7 +149,10 @@ test_files:
|
|
146
149
|
- spec/code.html
|
147
150
|
- spec/code_spec.rb
|
148
151
|
- spec/code_with_language.markdown
|
152
|
+
- spec/code_with_nbsp.html
|
153
|
+
- spec/code_with_nbsp.markdown
|
149
154
|
- spec/code_without_language.markdown
|
155
|
+
- spec/code_wth_nbsp.html
|
150
156
|
- spec/doc.html
|
151
157
|
- spec/doc.markdown
|
152
158
|
- spec/doc_spec.rb
|