markitdown 0.0.9 → 0.0.10
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +2 -0
- data/README.md +2 -0
- data/lib/markitdown/version.rb +1 -1
- data/lib/markitdown.rb +17 -8
- data/spec/asmartbear.html +23 -0
- data/spec/asmartbear.markdown +10 -0
- data/spec/doc.markdown +1 -1
- data/spec/doc_spec.rb +8 -0
- data/spec/tag_spec.rb +3 -3
- metadata +9 -5
data/.gitignore
CHANGED
data/README.md
CHANGED
data/lib/markitdown/version.rb
CHANGED
data/lib/markitdown.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# encoding=utf-8
|
2
|
+
|
1
3
|
require "markitdown/version"
|
2
4
|
require "nokogiri"
|
3
5
|
|
@@ -11,7 +13,7 @@ module Markitdown
|
|
11
13
|
# gsub(/\n{2,}/,"\n\n") - collapse any series of 2 or more new lines down to exactly 2
|
12
14
|
# gsub(/\t+/," ") - collapse consecutive tabs down to a single space. I use tabs to pad divs and spans; this causes multiple nested spans and divs to ultimately be surrounded by a single space.
|
13
15
|
# gsub(/ ([\.\?])/,'\1') - removes a space before a period or question mark. Things like links get surrounded by spaces. If they appear at the end of a sentence, this makes sure the punctuation isn't off.
|
14
|
-
self.parse_node(node).flatten.compact.join.gsub(/\n\s+\n/,"\n\n").gsub(/\n{2,}/,"\n\n").gsub(/( > \n){2,}/,"\n > \n > ").gsub(/\t+/," ").gsub(/ ([\.\?])/,'\1')
|
16
|
+
self.parse_node(node).flatten.compact.join.gsub(/\n\s+\n/,"\n\n").gsub(/\n{2,}/,"\n\n").gsub(/( > \n){2,}/,"\n > \n > ").gsub(/\t+/," ").gsub(/ ([\.\?])/,'\1').gsub(/\s*END_TAG\((.{1,3})\)/, "\\1")
|
15
17
|
end
|
16
18
|
|
17
19
|
private
|
@@ -69,17 +71,17 @@ module Markitdown
|
|
69
71
|
when "br"
|
70
72
|
results << self.newline(pre, nil, 2)
|
71
73
|
when "em"
|
72
|
-
results << "
|
73
|
-
after = "
|
74
|
+
results << " _"
|
75
|
+
after = "END_TAG(_) "
|
74
76
|
when "i"
|
75
|
-
results << "
|
76
|
-
after = "
|
77
|
+
results << " _"
|
78
|
+
after = "END_TAG(_) "
|
77
79
|
when "strong"
|
78
80
|
results << " **"
|
79
|
-
after = "** "
|
81
|
+
after = "END_TAG(**) "
|
80
82
|
when "b"
|
81
83
|
results << " **"
|
82
|
-
after = "** "
|
84
|
+
after = "END_TAG(**) "
|
83
85
|
when "blockquote"
|
84
86
|
results << "\n\n"
|
85
87
|
results << pre
|
@@ -136,7 +138,14 @@ module Markitdown
|
|
136
138
|
results << contents
|
137
139
|
end
|
138
140
|
end
|
139
|
-
|
141
|
+
if strip_content
|
142
|
+
last_tags = results.pop
|
143
|
+
after = after.flatten.compact.join if after.is_a?(Array)
|
144
|
+
last_tags = "#{last_tags}#{after}"
|
145
|
+
results << last_tags
|
146
|
+
else
|
147
|
+
results << after
|
148
|
+
end
|
140
149
|
states.shift
|
141
150
|
results
|
142
151
|
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
<div>
|
2
|
+
A <a href="http://blog.smartbear.com/careers/how-to-speak-to-a-technical-person/">smart bear writes about how to talk to a technical person</a>. Some of these points really hit home for me. When I'm with my friends, I get sarcasm, but in interactions where I don't expect it, it often goes over my head. Also ambiguous pronouns are one of my biggest pet peeves both in technical and casual conversation. For instance, "My friend was talking to her boss, she said X". Did your friend say X, or her boss?
|
3
|
+
|
4
|
+
<blockquote>
|
5
|
+
<br clear="none"/>
|
6
|
+
<div style="position: relative">
|
7
|
+
<div style="font-size: 16px">
|
8
|
+
<div style="text-decoration:none;background-color:rgb(255, 255, 255);color:rgb(51, 51, 51);font-size:16px;font-family:Arial, sans-serif;line-height:1.5625;">
|
9
|
+
<div style="background-color:rgb(255, 255, 255);">
|
10
|
+
<div style="background-color:rgb(255, 255, 255);overflow:hidden;">
|
11
|
+
<div style="background-color:rgb(255, 255, 255);">
|
12
|
+
<div style="overflow:hidden;">
|
13
|
+
<span style="line-height: 1.5625;">Try to avoid using vague words like "one" or "it" to describe an issue. For example:<i style="line-height: 1.5625;">"When I opened the document, the content was askew. When I tried to print the document, the printer jammed up.<b>It</b>cannot blow up like that!"</i><span style="line-height: 1.5625;">In the last sentence of this example, are you complaining about the document or the printer? It's not really clear. So I would repeat myself even at the risk of sounding redundant. It's better to be redundant and clear than to leave the other person wondering.</span><br/>
|
14
|
+
</div>
|
15
|
+
</div>
|
16
|
+
</div>
|
17
|
+
</div>
|
18
|
+
</div>
|
19
|
+
</div>
|
20
|
+
</div>
|
21
|
+
<br clear="none"/>
|
22
|
+
</blockquote>
|
23
|
+
</div>
|
@@ -0,0 +1,10 @@
|
|
1
|
+
A [smart bear writes about how to talk to a technical person](http://blog.smartbear.com/careers/how-to-speak-to-a-technical-person/). Some of these points really hit home for me. When I'm with my friends, I get sarcasm, but in interactions where I don't expect it, it often goes over my head. Also ambiguous pronouns are one of my biggest pet peeves both in technical and casual conversation. For instance, "My friend was talking to her boss, she said X". Did your friend say X, or her boss?
|
2
|
+
|
3
|
+
>
|
4
|
+
>
|
5
|
+
> Try to avoid using vague words like "one" or "it" to describe an issue. For example: _"When I opened the document, the content was askew. When I tried to print the document, the printer jammed up. **It** cannot blow up like that!"_ In the last sentence of this example, are you complaining about the document or the printer? It's not really clear. So I would repeat myself even at the risk of sounding redundant. It's better to be redundant and clear than to leave the other person wondering.
|
6
|
+
>
|
7
|
+
>
|
8
|
+
>
|
9
|
+
>
|
10
|
+
|
data/spec/doc.markdown
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
# Main Header
|
4
4
|
|
5
|
-
This
|
5
|
+
This _is_ a **test**. It includes a [link](http://www.google.com) as well as an image ![Google Logo](https://www.google.com/images/srpr/logo3w.png)
|
6
6
|
|
7
7
|
* bullet 1
|
8
8
|
* bullet 2
|
data/spec/doc_spec.rb
CHANGED
@@ -9,6 +9,14 @@ describe Markitdown do
|
|
9
9
|
end
|
10
10
|
end
|
11
11
|
|
12
|
+
context "When parsing the document 'asmartbear'" do
|
13
|
+
let(:html) { File.read("spec/asmartbear.html") }
|
14
|
+
|
15
|
+
it "should produce valid markdown" do
|
16
|
+
Markitdown.from_html(html).should == File.read("spec/asmartbear.markdown")
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
12
20
|
context "When parsing an evernote document" do
|
13
21
|
let(:xml) { File.read("spec/evernote.xml") }
|
14
22
|
|
data/spec/tag_spec.rb
CHANGED
@@ -77,7 +77,7 @@ describe Markitdown do
|
|
77
77
|
let(:html) { "<em>emphasis added</em>" }
|
78
78
|
|
79
79
|
it "should return valid markdown" do
|
80
|
-
Markitdown.from_html(html).should == "
|
80
|
+
Markitdown.from_html(html).should == " _emphasis added_ "
|
81
81
|
end
|
82
82
|
end
|
83
83
|
|
@@ -85,7 +85,7 @@ describe Markitdown do
|
|
85
85
|
let(:html) { "<i>italics added</i>" }
|
86
86
|
|
87
87
|
it "should return valid markdown" do
|
88
|
-
Markitdown.from_html(html).should == "
|
88
|
+
Markitdown.from_html(html).should == " _italics added_ "
|
89
89
|
end
|
90
90
|
end
|
91
91
|
|
@@ -117,7 +117,7 @@ describe Markitdown do
|
|
117
117
|
let(:html) { "<html><em>emphasis added</em>?</html>" }
|
118
118
|
|
119
119
|
it "should return valid markdown without a space" do
|
120
|
-
Markitdown.from_html(html).should == "
|
120
|
+
Markitdown.from_html(html).should == " _emphasis added_?"
|
121
121
|
end
|
122
122
|
end
|
123
123
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: markitdown
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.10
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-05
|
12
|
+
date: 2013-07-05 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -75,6 +75,8 @@ files:
|
|
75
75
|
- lib/markitdown.rb
|
76
76
|
- lib/markitdown/version.rb
|
77
77
|
- markitdown.gemspec
|
78
|
+
- spec/asmartbear.html
|
79
|
+
- spec/asmartbear.markdown
|
78
80
|
- spec/doc.html
|
79
81
|
- spec/doc.markdown
|
80
82
|
- spec/doc_spec.rb
|
@@ -96,7 +98,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
96
98
|
version: '0'
|
97
99
|
segments:
|
98
100
|
- 0
|
99
|
-
hash:
|
101
|
+
hash: 73152326707210871
|
100
102
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
101
103
|
none: false
|
102
104
|
requirements:
|
@@ -105,14 +107,16 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
105
107
|
version: '0'
|
106
108
|
segments:
|
107
109
|
- 0
|
108
|
-
hash:
|
110
|
+
hash: 73152326707210871
|
109
111
|
requirements: []
|
110
112
|
rubyforge_project:
|
111
|
-
rubygems_version: 1.8.
|
113
|
+
rubygems_version: 1.8.23
|
112
114
|
signing_key:
|
113
115
|
specification_version: 3
|
114
116
|
summary: Converts HTML to Markdown
|
115
117
|
test_files:
|
118
|
+
- spec/asmartbear.html
|
119
|
+
- spec/asmartbear.markdown
|
116
120
|
- spec/doc.html
|
117
121
|
- spec/doc.markdown
|
118
122
|
- spec/doc_spec.rb
|