markitdown 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # Markitdown [![Build Status](https://secure.travis-ci.org/cpetersen/markitdown.png)](http://travis-ci.org/cpetersen/markitdown)
1
+ # Markitdown [![Build Status](https://secure.travis-ci.org/cpetersen/markitdown.png)](http://travis-ci.org/cpetersen/markitdown) [![Coverage Status](https://coveralls.io/repos/cpetersen/markitdown/badge.png?branch=master)](https://coveralls.io/r/cpetersen/markitdown?branch=master)
2
2
 
3
3
  Markitdown is a Ruby library that converts HTML to Markdown. It's powered by Nokogiri. It supports:
4
4
 
@@ -4,20 +4,20 @@ require "markitdown/version"
4
4
  require "nokogiri"
5
5
 
6
6
  module Markitdown
7
- def self.from_html(html)
8
- from_nokogiri(Nokogiri::XML(html).root)
7
+ def self.from_html(html, language_classifier=nil)
8
+ from_nokogiri(Nokogiri::XML(html).root, language_classifier)
9
9
  end
10
10
 
11
- def self.from_nokogiri(node)
11
+ def self.from_nokogiri(node, language_classifier=nil)
12
12
  # gsub(/\n\s+\n/,"\n\n") - remove lines with nothing but space characters
13
13
  # gsub(/\n{2,}/,"\n\n") - collapse any series of more than 2 new lines down to 2
14
14
  # gsub(/\t+/," ") - collapse consecutive tabs down to a single space. I use tabs to pad divs and spans; this causes multiple nested spans and divs to ultimately be surrounded by a single space.
15
15
  # gsub(/ ([\.\?])/,'\1') - removes a space before a period or question mark. Things like links get surrounded by spaces. If they appear at the end of a sentence, this makes sure the punctuation isn't off.
16
- self.parse_node(node).flatten.compact.join.gsub(/\n\s+\n/,"\n\n").gsub(/\n{2,}/,"\n\n").gsub(/( > \n){2,}/,"\n > \n > ").gsub(/\t+/," ").gsub(/ ([\.\?])/,'\1').gsub(/\s*END_TAG\((.{1,3})\)/, "\\1")
16
+ self.parse_node(node, [], language_classifier).flatten.compact.join.gsub(/\n\s+\n/,"\n\n").gsub(/\n{2,}/,"\n\n").gsub(/( > \n){2,}/,"\n > \n > ").gsub(/\t+/," ").gsub(/ ([\.\?])/,'\1').gsub(/\s*END_TAG\((.{1,3})\)/, "\\1")
17
17
  end
18
18
 
19
19
  private
20
- def self.parse_node(node, states=[])
20
+ def self.parse_node(node, states=[], language_classifier=nil)
21
21
  results=[]
22
22
  after = nil
23
23
  states.unshift node.name.downcase
@@ -141,7 +141,13 @@ module Markitdown
141
141
  results << node.text.strip.gsub("\n","").gsub(/ {2,}/," ")
142
142
  when "code"
143
143
  if node.text.include?("\n")
144
- results << "\n\n```\n#{node.text.gsub(/^\n/,"").gsub(/\n\s*$/,"")}\n```\n\n"
144
+ text = node.text.gsub(/^\n/,"").gsub(/\n\s*$/,"")
145
+ if language_classifier
146
+ language = language_classifier.classify(text)
147
+ results << "\n\n```#{language}\n#{text}\n```\n\n"
148
+ else
149
+ results << "\n\n```\n#{text}\n```\n\n"
150
+ end
145
151
  else
146
152
  results << " `#{node.text}` "
147
153
  end
@@ -171,7 +177,7 @@ module Markitdown
171
177
 
172
178
  if recurse
173
179
  node.children.each do |child|
174
- contents = self.parse_node(child, states)
180
+ contents = self.parse_node(child, states, language_classifier)
175
181
  contents = contents.flatten.compact.join.strip if strip_content
176
182
  contents = [contents].flatten.compact.join.gsub("\n", " ") if flatten_content
177
183
  results << contents
@@ -1,3 +1,3 @@
1
1
  module Markitdown
2
- VERSION = "0.2.1"
2
+ VERSION = "0.3.0"
3
3
  end
@@ -11,6 +11,7 @@ Gem::Specification.new do |gem|
11
11
  gem.add_dependency('nokogiri')
12
12
  gem.add_development_dependency('rake')
13
13
  gem.add_development_dependency('rspec')
14
+ gem.add_development_dependency('coveralls')
14
15
 
15
16
  gem.files = `git ls-files`.split($\)
16
17
  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
@@ -0,0 +1,38 @@
1
+ <div>
2
+ This is an html block
3
+ <code>
4
+ &lt;table&gt;
5
+   &lt;thead&gt;
6
+     &lt;tr&gt;
7
+       &lt;th&gt;Column 1&lt;/th&gt;
8
+       &lt;th&gt;Column 2&lt;/th&gt;
9
+       &lt;th&gt;Column 3&lt;/th&gt;
10
+     &lt;/tr&gt;
11
+   &lt;/thead&gt;
12
+   &lt;tbody&gt;
13
+     &lt;tr&gt;
14
+       &lt;td&gt;Value 1&lt;/td&gt;
15
+       &lt;td&gt;Value 2&lt;/td&gt;
16
+       &lt;td&gt;Value 3&lt;/td&gt;
17
+     &lt;/tr&gt;
18
+     &lt;tr&gt;
19
+       &lt;td&gt;Value 1a&lt;/td&gt;
20
+       &lt;td&gt;Value 2a&lt;/td&gt;
21
+       &lt;td&gt;Value 3a&lt;/td&gt;
22
+     &lt;/tr&gt;
23
+   &lt;/tbody&gt;
24
+ &lt;/table&gt;
25
+ </code>
26
+
27
+ This is a ruby block
28
+ <code>
29
+ # GET /blogs/1
30
+ # GET /blogs/1.json
31
+ def show
32
+ respond_to do |format|
33
+ format.html # show.html.erb
34
+ format.json { render json: @blog }
35
+ end
36
+ end
37
+ </code>
38
+ </div>
@@ -0,0 +1,25 @@
1
+ require 'markitdown'
2
+ require 'spec_helper'
3
+
4
+ describe Markitdown do
5
+ context "When parsing codeblocks" do
6
+ let(:html) { File.read("spec/code.html") }
7
+
8
+ context "and not guessing the language" do
9
+ let(:markdown) { File.read("spec/code_without_language.markdown") }
10
+
11
+ it "should produce valid markdown" do
12
+ Markitdown.from_html(html).should == markdown
13
+ end
14
+ end
15
+
16
+ context "and guessing the language" do
17
+ let(:markdown) { File.read("spec/code_with_language.markdown") }
18
+ let(:classifier) { TestLanguageClassifier.new }
19
+
20
+ it "should produce valid markdown" do
21
+ Markitdown.from_html(html, classifier).should == markdown
22
+ end
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,40 @@
1
+ This is an html block
2
+
3
+ ```html
4
+ <table>
5
+   <thead>
6
+     <tr>
7
+       <th>Column 1</th>
8
+       <th>Column 2</th>
9
+       <th>Column 3</th>
10
+     </tr>
11
+   </thead>
12
+   <tbody>
13
+     <tr>
14
+       <td>Value 1</td>
15
+       <td>Value 2</td>
16
+       <td>Value 3</td>
17
+     </tr>
18
+     <tr>
19
+       <td>Value 1a</td>
20
+       <td>Value 2a</td>
21
+       <td>Value 3a</td>
22
+     </tr>
23
+   </tbody>
24
+ </table>
25
+ ```
26
+
27
+ This is a ruby block
28
+
29
+ ```ruby
30
+ # GET /blogs/1
31
+ # GET /blogs/1.json
32
+ def show
33
+ respond_to do |format|
34
+ format.html # show.html.erb
35
+ format.json { render json: @blog }
36
+ end
37
+ end
38
+ ```
39
+
40
+
@@ -0,0 +1,40 @@
1
+ This is an html block
2
+
3
+ ```
4
+ <table>
5
+   <thead>
6
+     <tr>
7
+       <th>Column 1</th>
8
+       <th>Column 2</th>
9
+       <th>Column 3</th>
10
+     </tr>
11
+   </thead>
12
+   <tbody>
13
+     <tr>
14
+       <td>Value 1</td>
15
+       <td>Value 2</td>
16
+       <td>Value 3</td>
17
+     </tr>
18
+     <tr>
19
+       <td>Value 1a</td>
20
+       <td>Value 2a</td>
21
+       <td>Value 3a</td>
22
+     </tr>
23
+   </tbody>
24
+ </table>
25
+ ```
26
+
27
+ This is a ruby block
28
+
29
+ ```
30
+ # GET /blogs/1
31
+ # GET /blogs/1.json
32
+ def show
33
+ respond_to do |format|
34
+ format.html # show.html.erb
35
+ format.json { render json: @blog }
36
+ end
37
+ end
38
+ ```
39
+
40
+
@@ -1,4 +1,5 @@
1
1
  require 'markitdown'
2
+ require 'spec_helper'
2
3
 
3
4
  describe Markitdown do
4
5
  context "When parsing a document" do
@@ -1,4 +1,5 @@
1
1
  require 'markitdown'
2
+ require 'spec_helper'
2
3
 
3
4
  describe Markitdown do
4
5
  context "when parsing nested ordered lists" do
@@ -0,0 +1,15 @@
1
+ require 'coveralls'
2
+
3
+ class TestLanguageClassifier
4
+ def classify(code)
5
+ if code
6
+ if code.match /<table>/
7
+ return "html"
8
+ elsif code.match /def/
9
+ return "ruby"
10
+ end
11
+ end
12
+ end
13
+ end
14
+
15
+ Coveralls.wear!
@@ -1,4 +1,5 @@
1
1
  require 'markitdown'
2
+ require 'spec_helper'
2
3
 
3
4
  describe Markitdown do
4
5
  context "When parsing a table with a thead and tbody" do
@@ -1,4 +1,5 @@
1
1
  require 'markitdown'
2
+ require 'spec_helper'
2
3
 
3
4
  describe Markitdown do
4
5
  context "When parsing a paragraph" do
@@ -166,7 +167,7 @@ describe Markitdown do
166
167
 
167
168
  it "should return valid markdown with spaces" do
168
169
  pending "Still need to figure out leading spaces for <sup> elements"
169
- Markitdown.from_html(html).should == "This ^(is a) test"
170
+ # Markitdown.from_html(html).should == "This ^(is a) test"
170
171
  end
171
172
  end
172
173
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: markitdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-08-06 00:00:00.000000000 Z
12
+ date: 2013-08-18 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -59,6 +59,22 @@ dependencies:
59
59
  - - ! '>='
60
60
  - !ruby/object:Gem::Version
61
61
  version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: coveralls
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
62
78
  description: A library that uses Nokogiri to parse HTML and produce Markdown
63
79
  email:
64
80
  - christopher.petersen@gmail.com
@@ -77,12 +93,17 @@ files:
77
93
  - markitdown.gemspec
78
94
  - spec/asmartbear.html
79
95
  - spec/asmartbear.markdown
96
+ - spec/code.html
97
+ - spec/code_spec.rb
98
+ - spec/code_with_language.markdown
99
+ - spec/code_without_language.markdown
80
100
  - spec/doc.html
81
101
  - spec/doc.markdown
82
102
  - spec/doc_spec.rb
83
103
  - spec/evernote.markdown
84
104
  - spec/evernote.xml
85
105
  - spec/nesting_spec.rb
106
+ - spec/spec_helper.rb
86
107
  - spec/table.html
87
108
  - spec/table.markdown
88
109
  - spec/table2.html
@@ -103,7 +124,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
103
124
  version: '0'
104
125
  segments:
105
126
  - 0
106
- hash: -2638333383769331236
127
+ hash: -1182280358588269194
107
128
  required_rubygems_version: !ruby/object:Gem::Requirement
108
129
  none: false
109
130
  requirements:
@@ -112,7 +133,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
112
133
  version: '0'
113
134
  segments:
114
135
  - 0
115
- hash: -2638333383769331236
136
+ hash: -1182280358588269194
116
137
  requirements: []
117
138
  rubyforge_project:
118
139
  rubygems_version: 1.8.23
@@ -122,12 +143,17 @@ summary: Converts HTML to Markdown
122
143
  test_files:
123
144
  - spec/asmartbear.html
124
145
  - spec/asmartbear.markdown
146
+ - spec/code.html
147
+ - spec/code_spec.rb
148
+ - spec/code_with_language.markdown
149
+ - spec/code_without_language.markdown
125
150
  - spec/doc.html
126
151
  - spec/doc.markdown
127
152
  - spec/doc_spec.rb
128
153
  - spec/evernote.markdown
129
154
  - spec/evernote.xml
130
155
  - spec/nesting_spec.rb
156
+ - spec/spec_helper.rb
131
157
  - spec/table.html
132
158
  - spec/table.markdown
133
159
  - spec/table2.html