markitdown 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # Markitdown [![Build Status](https://secure.travis-ci.org/cpetersen/markitdown.png)](http://travis-ci.org/cpetersen/markitdown)
1
+ # Markitdown [![Build Status](https://secure.travis-ci.org/cpetersen/markitdown.png)](http://travis-ci.org/cpetersen/markitdown) [![Coverage Status](https://coveralls.io/repos/cpetersen/markitdown/badge.png?branch=master)](https://coveralls.io/r/cpetersen/markitdown?branch=master)
2
2
 
3
3
  Markitdown is a Ruby library that converts HTML to Markdown. It's powered by Nokogiri. It supports:
4
4
 
@@ -4,20 +4,20 @@ require "markitdown/version"
4
4
  require "nokogiri"
5
5
 
6
6
  module Markitdown
7
- def self.from_html(html)
8
- from_nokogiri(Nokogiri::XML(html).root)
7
+ def self.from_html(html, language_classifier=nil)
8
+ from_nokogiri(Nokogiri::XML(html).root, language_classifier)
9
9
  end
10
10
 
11
- def self.from_nokogiri(node)
11
+ def self.from_nokogiri(node, language_classifier=nil)
12
12
  # gsub(/\n\s+\n/,"\n\n") - remove lines with nothing but space characters
13
13
  # gsub(/\n{2,}/,"\n\n") - collapse any series of more than 2 newlines down to 2
14
14
  # gsub(/\t+/," ") - collapse consecutive tabs down to a single space. I use tabs to pad divs and spans; this causes multiple nested spans and divs to ultimately be surrounded by a single space.
15
15
  # gsub(/ ([\.\?])/,'\1') - removes a space before a period or question mark. Things like links get surrounded by spaces. If they appear at the end of a sentence, this makes sure the punctuation isn't off.
16
- self.parse_node(node).flatten.compact.join.gsub(/\n\s+\n/,"\n\n").gsub(/\n{2,}/,"\n\n").gsub(/( > \n){2,}/,"\n > \n > ").gsub(/\t+/," ").gsub(/ ([\.\?])/,'\1').gsub(/\s*END_TAG\((.{1,3})\)/, "\\1")
16
+ self.parse_node(node, [], language_classifier).flatten.compact.join.gsub(/\n\s+\n/,"\n\n").gsub(/\n{2,}/,"\n\n").gsub(/( > \n){2,}/,"\n > \n > ").gsub(/\t+/," ").gsub(/ ([\.\?])/,'\1').gsub(/\s*END_TAG\((.{1,3})\)/, "\\1")
17
17
  end
18
18
 
19
19
  private
20
- def self.parse_node(node, states=[])
20
+ def self.parse_node(node, states=[], language_classifier=nil)
21
21
  results=[]
22
22
  after = nil
23
23
  states.unshift node.name.downcase
@@ -141,7 +141,13 @@ module Markitdown
141
141
  results << node.text.strip.gsub("\n","").gsub(/ {2,}/," ")
142
142
  when "code"
143
143
  if node.text.include?("\n")
144
- results << "\n\n```\n#{node.text.gsub(/^\n/,"").gsub(/\n\s*$/,"")}\n```\n\n"
144
+ text = node.text.gsub(/^\n/,"").gsub(/\n\s*$/,"")
145
+ if language_classifier
146
+ language = language_classifier.classify(text)
147
+ results << "\n\n```#{language}\n#{text}\n```\n\n"
148
+ else
149
+ results << "\n\n```\n#{text}\n```\n\n"
150
+ end
145
151
  else
146
152
  results << " `#{node.text}` "
147
153
  end
@@ -171,7 +177,7 @@ module Markitdown
171
177
 
172
178
  if recurse
173
179
  node.children.each do |child|
174
- contents = self.parse_node(child, states)
180
+ contents = self.parse_node(child, states, language_classifier)
175
181
  contents = contents.flatten.compact.join.strip if strip_content
176
182
  contents = [contents].flatten.compact.join.gsub("\n", " ") if flatten_content
177
183
  results << contents
@@ -1,3 +1,3 @@
1
1
  module Markitdown
2
- VERSION = "0.2.1"
2
+ VERSION = "0.3.0"
3
3
  end
@@ -11,6 +11,7 @@ Gem::Specification.new do |gem|
11
11
  gem.add_dependency('nokogiri')
12
12
  gem.add_development_dependency('rake')
13
13
  gem.add_development_dependency('rspec')
14
+ gem.add_development_dependency('coveralls')
14
15
 
15
16
  gem.files = `git ls-files`.split($\)
16
17
  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
@@ -0,0 +1,38 @@
1
+ <div>
2
+ This is an html block
3
+ <code>
4
+ &lt;table&gt;
5
+   &lt;thead&gt;
6
+     &lt;tr&gt;
7
+       &lt;th&gt;Column 1&lt;/th&gt;
8
+       &lt;th&gt;Column 2&lt;/th&gt;
9
+       &lt;th&gt;Column 3&lt;/th&gt;
10
+     &lt;/tr&gt;
11
+   &lt;/thead&gt;
12
+   &lt;tbody&gt;
13
+     &lt;tr&gt;
14
+       &lt;td&gt;Value 1&lt;/td&gt;
15
+       &lt;td&gt;Value 2&lt;/td&gt;
16
+       &lt;td&gt;Value 3&lt;/td&gt;
17
+     &lt;/tr&gt;
18
+     &lt;tr&gt;
19
+       &lt;td&gt;Value 1a&lt;/td&gt;
20
+       &lt;td&gt;Value 2a&lt;/td&gt;
21
+       &lt;td&gt;Value 3a&lt;/td&gt;
22
+     &lt;/tr&gt;
23
+   &lt;/tbody&gt;
24
+ &lt;/table&gt;
25
+ </code>
26
+
27
+ This is a ruby block
28
+ <code>
29
+ # GET /blogs/1
30
+ # GET /blogs/1.json
31
+ def show
32
+ respond_to do |format|
33
+ format.html # show.html.erb
34
+ format.json { render json: @blog }
35
+ end
36
+ end
37
+ </code>
38
+ </div>
@@ -0,0 +1,25 @@
1
+ require 'markitdown'
2
+ require 'spec_helper'
3
+
4
+ describe Markitdown do
5
+ context "When parsing codeblocks" do
6
+ let(:html) { File.read("spec/code.html") }
7
+
8
+ context "and not guessing the language" do
9
+ let(:markdown) { File.read("spec/code_without_language.markdown") }
10
+
11
+ it "should produce valid markdown" do
12
+ Markitdown.from_html(html).should == markdown
13
+ end
14
+ end
15
+
16
+ context "and guessing the language" do
17
+ let(:markdown) { File.read("spec/code_with_language.markdown") }
18
+ let(:classifier) { TestLanguageClassifier.new }
19
+
20
+ it "should produce valid markdown" do
21
+ Markitdown.from_html(html, classifier).should == markdown
22
+ end
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,40 @@
1
+ This is an html block
2
+
3
+ ```html
4
+ <table>
5
+   <thead>
6
+     <tr>
7
+       <th>Column 1</th>
8
+       <th>Column 2</th>
9
+       <th>Column 3</th>
10
+     </tr>
11
+   </thead>
12
+   <tbody>
13
+     <tr>
14
+       <td>Value 1</td>
15
+       <td>Value 2</td>
16
+       <td>Value 3</td>
17
+     </tr>
18
+     <tr>
19
+       <td>Value 1a</td>
20
+       <td>Value 2a</td>
21
+       <td>Value 3a</td>
22
+     </tr>
23
+   </tbody>
24
+ </table>
25
+ ```
26
+
27
+ This is a ruby block
28
+
29
+ ```ruby
30
+ # GET /blogs/1
31
+ # GET /blogs/1.json
32
+ def show
33
+ respond_to do |format|
34
+ format.html # show.html.erb
35
+ format.json { render json: @blog }
36
+ end
37
+ end
38
+ ```
39
+
40
+
@@ -0,0 +1,40 @@
1
+ This is an html block
2
+
3
+ ```
4
+ <table>
5
+   <thead>
6
+     <tr>
7
+       <th>Column 1</th>
8
+       <th>Column 2</th>
9
+       <th>Column 3</th>
10
+     </tr>
11
+   </thead>
12
+   <tbody>
13
+     <tr>
14
+       <td>Value 1</td>
15
+       <td>Value 2</td>
16
+       <td>Value 3</td>
17
+     </tr>
18
+     <tr>
19
+       <td>Value 1a</td>
20
+       <td>Value 2a</td>
21
+       <td>Value 3a</td>
22
+     </tr>
23
+   </tbody>
24
+ </table>
25
+ ```
26
+
27
+ This is a ruby block
28
+
29
+ ```
30
+ # GET /blogs/1
31
+ # GET /blogs/1.json
32
+ def show
33
+ respond_to do |format|
34
+ format.html # show.html.erb
35
+ format.json { render json: @blog }
36
+ end
37
+ end
38
+ ```
39
+
40
+
@@ -1,4 +1,5 @@
1
1
  require 'markitdown'
2
+ require 'spec_helper'
2
3
 
3
4
  describe Markitdown do
4
5
  context "When parsing a document" do
@@ -1,4 +1,5 @@
1
1
  require 'markitdown'
2
+ require 'spec_helper'
2
3
 
3
4
  describe Markitdown do
4
5
  context "when parsing nested ordered lists" do
@@ -0,0 +1,15 @@
1
+ require 'coveralls'
2
+
3
+ class TestLanguageClassifier
4
+ def classify(code)
5
+ if code
6
+ if code.match /<table>/
7
+ return "html"
8
+ elsif code.match /def/
9
+ return "ruby"
10
+ end
11
+ end
12
+ end
13
+ end
14
+
15
+ Coveralls.wear!
@@ -1,4 +1,5 @@
1
1
  require 'markitdown'
2
+ require 'spec_helper'
2
3
 
3
4
  describe Markitdown do
4
5
  context "When parsing a table with a thead and tbody" do
@@ -1,4 +1,5 @@
1
1
  require 'markitdown'
2
+ require 'spec_helper'
2
3
 
3
4
  describe Markitdown do
4
5
  context "When parsing a paragraph" do
@@ -166,7 +167,7 @@ describe Markitdown do
166
167
 
167
168
  it "should return valid markdown with spaces" do
168
169
  pending "Still need to figure out leading spaces for <sup> elements"
169
- Markitdown.from_html(html).should == "This ^(is a) test"
170
+ # Markitdown.from_html(html).should == "This ^(is a) test"
170
171
  end
171
172
  end
172
173
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: markitdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-08-06 00:00:00.000000000 Z
12
+ date: 2013-08-18 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -59,6 +59,22 @@ dependencies:
59
59
  - - ! '>='
60
60
  - !ruby/object:Gem::Version
61
61
  version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: coveralls
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
62
78
  description: A library that uses Nokogiri to parse HTML and produce Markdown
63
79
  email:
64
80
  - christopher.petersen@gmail.com
@@ -77,12 +93,17 @@ files:
77
93
  - markitdown.gemspec
78
94
  - spec/asmartbear.html
79
95
  - spec/asmartbear.markdown
96
+ - spec/code.html
97
+ - spec/code_spec.rb
98
+ - spec/code_with_language.markdown
99
+ - spec/code_without_language.markdown
80
100
  - spec/doc.html
81
101
  - spec/doc.markdown
82
102
  - spec/doc_spec.rb
83
103
  - spec/evernote.markdown
84
104
  - spec/evernote.xml
85
105
  - spec/nesting_spec.rb
106
+ - spec/spec_helper.rb
86
107
  - spec/table.html
87
108
  - spec/table.markdown
88
109
  - spec/table2.html
@@ -103,7 +124,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
103
124
  version: '0'
104
125
  segments:
105
126
  - 0
106
- hash: -2638333383769331236
127
+ hash: -1182280358588269194
107
128
  required_rubygems_version: !ruby/object:Gem::Requirement
108
129
  none: false
109
130
  requirements:
@@ -112,7 +133,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
112
133
  version: '0'
113
134
  segments:
114
135
  - 0
115
- hash: -2638333383769331236
136
+ hash: -1182280358588269194
116
137
  requirements: []
117
138
  rubyforge_project:
118
139
  rubygems_version: 1.8.23
@@ -122,12 +143,17 @@ summary: Converts HTML to Markdown
122
143
  test_files:
123
144
  - spec/asmartbear.html
124
145
  - spec/asmartbear.markdown
146
+ - spec/code.html
147
+ - spec/code_spec.rb
148
+ - spec/code_with_language.markdown
149
+ - spec/code_without_language.markdown
125
150
  - spec/doc.html
126
151
  - spec/doc.markdown
127
152
  - spec/doc_spec.rb
128
153
  - spec/evernote.markdown
129
154
  - spec/evernote.xml
130
155
  - spec/nesting_spec.rb
156
+ - spec/spec_helper.rb
131
157
  - spec/table.html
132
158
  - spec/table.markdown
133
159
  - spec/table2.html