markitdown 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +1 -1
- data/lib/markitdown.rb +13 -7
- data/lib/markitdown/version.rb +1 -1
- data/markitdown.gemspec +1 -0
- data/spec/code.html +38 -0
- data/spec/code_spec.rb +25 -0
- data/spec/code_with_language.markdown +40 -0
- data/spec/code_without_language.markdown +40 -0
- data/spec/doc_spec.rb +1 -0
- data/spec/nesting_spec.rb +1 -0
- data/spec/spec_helper.rb +15 -0
- data/spec/table_spec.rb +1 -0
- data/spec/tag_spec.rb +2 -1
- metadata +30 -4
    
        data/README.md
    CHANGED
    
    | @@ -1,4 +1,4 @@ | |
| 1 | 
            -
            # Markitdown [](http://travis-ci.org/cpetersen/markitdown)
         | 
| 1 | 
            +
            # Markitdown [](http://travis-ci.org/cpetersen/markitdown) [](https://coveralls.io/r/cpetersen/markitdown?branch=master)
         | 
| 2 2 |  | 
| 3 3 | 
             
            Markitdown is a Ruby library that converts HTML to Markdown. It's powered by Nokogiri. It supports:
         | 
| 4 4 |  | 
    
        data/lib/markitdown.rb
    CHANGED
    
    | @@ -4,20 +4,20 @@ require "markitdown/version" | |
| 4 4 | 
             
            require "nokogiri"
         | 
| 5 5 |  | 
| 6 6 | 
             
            module Markitdown
         | 
| 7 | 
            -
              def self.from_html(html)
         | 
| 8 | 
            -
                from_nokogiri(Nokogiri::XML(html).root)
         | 
| 7 | 
            +
              def self.from_html(html, language_classifier=nil)
         | 
| 8 | 
            +
                from_nokogiri(Nokogiri::XML(html).root, language_classifier)
         | 
| 9 9 | 
             
              end
         | 
| 10 10 |  | 
| 11 | 
            -
              def self.from_nokogiri(node)
         | 
| 11 | 
            +
              def self.from_nokogiri(node, language_classifier=nil)
         | 
| 12 12 | 
             
                # gsub(/\n\s+\n/,"\n\n") - remove lines with nothing but space characters
         | 
| 13 13 | 
             
                # gsub(/\n{2,}/,"\n\n") - collapse any series of more an than 2 new lines down to 2
         | 
| 14 14 | 
             
                # gsub(/\t+/," ") - collapse consecutive tabs down to a single space. I use tabs to pad divs and span, this causes multiple nested spans and divs to ultimately be surrounded by a single space.
         | 
| 15 15 | 
             
                # gsub(/ ([\.\?])/,'\1') - removes a space before a period or question mark. Things like links get surrounded by spaces. If they appear at the end of a sentence, this makes sure the punctation isn't off.
         | 
| 16 | 
            -
                self.parse_node(node).flatten.compact.join.gsub(/\n\s+\n/,"\n\n").gsub(/\n{2,}/,"\n\n").gsub(/( > \n){2,}/,"\n > \n > ").gsub(/\t+/," ").gsub(/ ([\.\?])/,'\1').gsub(/\s*END_TAG\((.{1,3})\)/, "\\1")
         | 
| 16 | 
            +
                self.parse_node(node, [], language_classifier).flatten.compact.join.gsub(/\n\s+\n/,"\n\n").gsub(/\n{2,}/,"\n\n").gsub(/( > \n){2,}/,"\n > \n > ").gsub(/\t+/," ").gsub(/ ([\.\?])/,'\1').gsub(/\s*END_TAG\((.{1,3})\)/, "\\1")
         | 
| 17 17 | 
             
              end
         | 
| 18 18 |  | 
| 19 19 | 
             
              private
         | 
| 20 | 
            -
              def self.parse_node(node, states=[])
         | 
| 20 | 
            +
              def self.parse_node(node, states=[], language_classifier=nil)
         | 
| 21 21 | 
             
                results=[]
         | 
| 22 22 | 
             
                after = nil
         | 
| 23 23 | 
             
                states.unshift node.name.downcase
         | 
| @@ -141,7 +141,13 @@ module Markitdown | |
| 141 141 | 
             
                  results << node.text.strip.gsub("\n","").gsub(/ {2,}/," ")
         | 
| 142 142 | 
             
                when "code"
         | 
| 143 143 | 
             
                  if node.text.include?("\n")
         | 
| 144 | 
            -
                     | 
| 144 | 
            +
                    text = node.text.gsub(/^\n/,"").gsub(/\n\s*$/,"")
         | 
| 145 | 
            +
                    if language_classifier
         | 
| 146 | 
            +
                      language = language_classifier.classify(text)
         | 
| 147 | 
            +
                      results << "\n\n```#{language}\n#{text}\n```\n\n"
         | 
| 148 | 
            +
                    else          
         | 
| 149 | 
            +
                      results << "\n\n```\n#{text}\n```\n\n"
         | 
| 150 | 
            +
                    end
         | 
| 145 151 | 
             
                  else
         | 
| 146 152 | 
             
                    results << " `#{node.text}` "
         | 
| 147 153 | 
             
                  end
         | 
| @@ -171,7 +177,7 @@ module Markitdown | |
| 171 177 |  | 
| 172 178 | 
             
                if recurse
         | 
| 173 179 | 
             
                  node.children.each do |child|
         | 
| 174 | 
            -
                    contents = self.parse_node(child, states)
         | 
| 180 | 
            +
                    contents = self.parse_node(child, states, language_classifier)
         | 
| 175 181 | 
             
                    contents = contents.flatten.compact.join.strip if strip_content
         | 
| 176 182 | 
             
                    contents = [contents].flatten.compact.join.gsub("\n", " ") if flatten_content
         | 
| 177 183 | 
             
                    results << contents
         | 
    
        data/lib/markitdown/version.rb
    CHANGED
    
    
    
        data/markitdown.gemspec
    CHANGED
    
    | @@ -11,6 +11,7 @@ Gem::Specification.new do |gem| | |
| 11 11 | 
             
              gem.add_dependency('nokogiri')
         | 
| 12 12 | 
             
              gem.add_development_dependency('rake')
         | 
| 13 13 | 
             
              gem.add_development_dependency('rspec')
         | 
| 14 | 
            +
              gem.add_development_dependency('coveralls')
         | 
| 14 15 |  | 
| 15 16 | 
             
              gem.files         = `git ls-files`.split($\)
         | 
| 16 17 | 
             
              gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
         | 
    
        data/spec/code.html
    ADDED
    
    | @@ -0,0 +1,38 @@ | |
| 1 | 
            +
            <div>
         | 
| 2 | 
            +
              This is an html block
         | 
| 3 | 
            +
              <code>
         | 
| 4 | 
            +
                <table>
         | 
| 5 | 
            +
                  <thead>
         | 
| 6 | 
            +
                    <tr>
         | 
| 7 | 
            +
                      <th>Column 1</th>
         | 
| 8 | 
            +
                      <th>Column 2</th>
         | 
| 9 | 
            +
                      <th>Column 3</th>
         | 
| 10 | 
            +
                    </tr>
         | 
| 11 | 
            +
                  </thead>
         | 
| 12 | 
            +
                  <tbody>
         | 
| 13 | 
            +
                    <tr>
         | 
| 14 | 
            +
                      <td>Value 1</td>
         | 
| 15 | 
            +
                      <td>Value 2</td>
         | 
| 16 | 
            +
                      <td>Value 3</td>
         | 
| 17 | 
            +
                    </tr>
         | 
| 18 | 
            +
                    <tr>
         | 
| 19 | 
            +
                      <td>Value 1a</td>
         | 
| 20 | 
            +
                      <td>Value 2a</td>
         | 
| 21 | 
            +
                      <td>Value 3a</td>
         | 
| 22 | 
            +
                    </tr>
         | 
| 23 | 
            +
                  </tbody>
         | 
| 24 | 
            +
                </table>
         | 
| 25 | 
            +
              </code>
         | 
| 26 | 
            +
             | 
| 27 | 
            +
              This is a ruby block
         | 
| 28 | 
            +
              <code>
         | 
| 29 | 
            +
                # GET /blogs/1
         | 
| 30 | 
            +
                # GET /blogs/1.json
         | 
| 31 | 
            +
                def show
         | 
| 32 | 
            +
                  respond_to do |format|
         | 
| 33 | 
            +
                    format.html # show.html.erb
         | 
| 34 | 
            +
                    format.json { render json: @blog }
         | 
| 35 | 
            +
                  end
         | 
| 36 | 
            +
                end
         | 
| 37 | 
            +
              </code>
         | 
| 38 | 
            +
            </div>
         | 
    
        data/spec/code_spec.rb
    ADDED
    
    | @@ -0,0 +1,25 @@ | |
| 1 | 
            +
            require 'markitdown'
         | 
| 2 | 
            +
            require 'spec_helper'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            describe Markitdown do  
         | 
| 5 | 
            +
              context "When parsing codeblocks" do
         | 
| 6 | 
            +
                let(:html) { File.read("spec/code.html") }
         | 
| 7 | 
            +
             | 
| 8 | 
            +
                context "and not guessing the language" do
         | 
| 9 | 
            +
                  let(:markdown) { File.read("spec/code_without_language.markdown") }
         | 
| 10 | 
            +
             | 
| 11 | 
            +
                  it "should produce valid markdown" do
         | 
| 12 | 
            +
                    Markitdown.from_html(html).should == markdown
         | 
| 13 | 
            +
                  end
         | 
| 14 | 
            +
                end
         | 
| 15 | 
            +
             | 
| 16 | 
            +
                context "and guessing the language" do
         | 
| 17 | 
            +
                  let(:markdown) { File.read("spec/code_with_language.markdown") }
         | 
| 18 | 
            +
                  let(:classifier) { TestLanguageClassifier.new }
         | 
| 19 | 
            +
             | 
| 20 | 
            +
                  it "should produce valid markdown" do
         | 
| 21 | 
            +
                    Markitdown.from_html(html, classifier).should == markdown
         | 
| 22 | 
            +
                  end
         | 
| 23 | 
            +
                end
         | 
| 24 | 
            +
              end
         | 
| 25 | 
            +
            end
         | 
    
        data/spec/code_with_language.markdown
    ADDED
    
    | @@ -0,0 +1,40 @@ | |
| 1 | 
            +
             This is an html block
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            ```html
         | 
| 4 | 
            +
                <table>
         | 
| 5 | 
            +
                  <thead>
         | 
| 6 | 
            +
                    <tr>
         | 
| 7 | 
            +
                      <th>Column 1</th>
         | 
| 8 | 
            +
                      <th>Column 2</th>
         | 
| 9 | 
            +
                      <th>Column 3</th>
         | 
| 10 | 
            +
                    </tr>
         | 
| 11 | 
            +
                  </thead>
         | 
| 12 | 
            +
                  <tbody>
         | 
| 13 | 
            +
                    <tr>
         | 
| 14 | 
            +
                      <td>Value 1</td>
         | 
| 15 | 
            +
                      <td>Value 2</td>
         | 
| 16 | 
            +
                      <td>Value 3</td>
         | 
| 17 | 
            +
                    </tr>
         | 
| 18 | 
            +
                    <tr>
         | 
| 19 | 
            +
                      <td>Value 1a</td>
         | 
| 20 | 
            +
                      <td>Value 2a</td>
         | 
| 21 | 
            +
                      <td>Value 3a</td>
         | 
| 22 | 
            +
                    </tr>
         | 
| 23 | 
            +
                  </tbody>
         | 
| 24 | 
            +
                </table>
         | 
| 25 | 
            +
            ```
         | 
| 26 | 
            +
             | 
| 27 | 
            +
            This is a ruby block
         | 
| 28 | 
            +
             | 
| 29 | 
            +
            ```ruby
         | 
| 30 | 
            +
                # GET /blogs/1
         | 
| 31 | 
            +
                # GET /blogs/1.json
         | 
| 32 | 
            +
                def show
         | 
| 33 | 
            +
                  respond_to do |format|
         | 
| 34 | 
            +
                    format.html # show.html.erb
         | 
| 35 | 
            +
                    format.json { render json: @blog }
         | 
| 36 | 
            +
                  end
         | 
| 37 | 
            +
                end
         | 
| 38 | 
            +
            ```
         | 
| 39 | 
            +
             | 
| 40 | 
            +
             
         | 
    
        data/spec/code_without_language.markdown
    ADDED
    
    | @@ -0,0 +1,40 @@ | |
| 1 | 
            +
             This is an html block
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            ```
         | 
| 4 | 
            +
                <table>
         | 
| 5 | 
            +
                  <thead>
         | 
| 6 | 
            +
                    <tr>
         | 
| 7 | 
            +
                      <th>Column 1</th>
         | 
| 8 | 
            +
                      <th>Column 2</th>
         | 
| 9 | 
            +
                      <th>Column 3</th>
         | 
| 10 | 
            +
                    </tr>
         | 
| 11 | 
            +
                  </thead>
         | 
| 12 | 
            +
                  <tbody>
         | 
| 13 | 
            +
                    <tr>
         | 
| 14 | 
            +
                      <td>Value 1</td>
         | 
| 15 | 
            +
                      <td>Value 2</td>
         | 
| 16 | 
            +
                      <td>Value 3</td>
         | 
| 17 | 
            +
                    </tr>
         | 
| 18 | 
            +
                    <tr>
         | 
| 19 | 
            +
                      <td>Value 1a</td>
         | 
| 20 | 
            +
                      <td>Value 2a</td>
         | 
| 21 | 
            +
                      <td>Value 3a</td>
         | 
| 22 | 
            +
                    </tr>
         | 
| 23 | 
            +
                  </tbody>
         | 
| 24 | 
            +
                </table>
         | 
| 25 | 
            +
            ```
         | 
| 26 | 
            +
             | 
| 27 | 
            +
            This is a ruby block
         | 
| 28 | 
            +
             | 
| 29 | 
            +
            ```
         | 
| 30 | 
            +
                # GET /blogs/1
         | 
| 31 | 
            +
                # GET /blogs/1.json
         | 
| 32 | 
            +
                def show
         | 
| 33 | 
            +
                  respond_to do |format|
         | 
| 34 | 
            +
                    format.html # show.html.erb
         | 
| 35 | 
            +
                    format.json { render json: @blog }
         | 
| 36 | 
            +
                  end
         | 
| 37 | 
            +
                end
         | 
| 38 | 
            +
            ```
         | 
| 39 | 
            +
             | 
| 40 | 
            +
             
         | 
    
        data/spec/doc_spec.rb
    CHANGED
    
    
    
        data/spec/nesting_spec.rb
    CHANGED
    
    
    
        data/spec/spec_helper.rb
    ADDED
    
    
    
        data/spec/table_spec.rb
    CHANGED
    
    
    
        data/spec/tag_spec.rb
    CHANGED
    
    | @@ -1,4 +1,5 @@ | |
| 1 1 | 
             
            require 'markitdown'
         | 
| 2 | 
            +
            require 'spec_helper'
         | 
| 2 3 |  | 
| 3 4 | 
             
            describe Markitdown do
         | 
| 4 5 | 
             
              context "When parsing a paragraph" do
         | 
| @@ -166,7 +167,7 @@ describe Markitdown do | |
| 166 167 |  | 
| 167 168 | 
             
                it "should return valid markdown with spaces" do
         | 
| 168 169 | 
             
                  pending "Still need to figure out leading spaces for <sup> elements"
         | 
| 169 | 
            -
                  Markitdown.from_html(html).should == "This ^(is a) test"
         | 
| 170 | 
            +
                  # Markitdown.from_html(html).should == "This ^(is a) test"
         | 
| 170 171 | 
             
                end
         | 
| 171 172 | 
             
              end
         | 
| 172 173 |  | 
    
        metadata
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: markitdown
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.2.1
         | 
| 4 | 
            +
              version: 0.3.0
         | 
| 5 5 | 
             
              prerelease: 
         | 
| 6 6 | 
             
            platform: ruby
         | 
| 7 7 | 
             
            authors:
         | 
| @@ -9,7 +9,7 @@ authors: | |
| 9 9 | 
             
            autorequire: 
         | 
| 10 10 | 
             
            bindir: bin
         | 
| 11 11 | 
             
            cert_chain: []
         | 
| 12 | 
            -
            date: 2013-08- | 
| 12 | 
            +
            date: 2013-08-18 00:00:00.000000000 Z
         | 
| 13 13 | 
             
            dependencies:
         | 
| 14 14 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 15 15 | 
             
              name: nokogiri
         | 
| @@ -59,6 +59,22 @@ dependencies: | |
| 59 59 | 
             
                - - ! '>='
         | 
| 60 60 | 
             
                  - !ruby/object:Gem::Version
         | 
| 61 61 | 
             
                    version: '0'
         | 
| 62 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 63 | 
            +
              name: coveralls
         | 
| 64 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 65 | 
            +
                none: false
         | 
| 66 | 
            +
                requirements:
         | 
| 67 | 
            +
                - - ! '>='
         | 
| 68 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 69 | 
            +
                    version: '0'
         | 
| 70 | 
            +
              type: :development
         | 
| 71 | 
            +
              prerelease: false
         | 
| 72 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 73 | 
            +
                none: false
         | 
| 74 | 
            +
                requirements:
         | 
| 75 | 
            +
                - - ! '>='
         | 
| 76 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 77 | 
            +
                    version: '0'
         | 
| 62 78 | 
             
            description: A library that uses Nokogiri to parse HTML and produce Markdown
         | 
| 63 79 | 
             
            email:
         | 
| 64 80 | 
             
            - christopher.petersen@gmail.com
         | 
| @@ -77,12 +93,17 @@ files: | |
| 77 93 | 
             
            - markitdown.gemspec
         | 
| 78 94 | 
             
            - spec/asmartbear.html
         | 
| 79 95 | 
             
            - spec/asmartbear.markdown
         | 
| 96 | 
            +
            - spec/code.html
         | 
| 97 | 
            +
            - spec/code_spec.rb
         | 
| 98 | 
            +
            - spec/code_with_language.markdown
         | 
| 99 | 
            +
            - spec/code_without_language.markdown
         | 
| 80 100 | 
             
            - spec/doc.html
         | 
| 81 101 | 
             
            - spec/doc.markdown
         | 
| 82 102 | 
             
            - spec/doc_spec.rb
         | 
| 83 103 | 
             
            - spec/evernote.markdown
         | 
| 84 104 | 
             
            - spec/evernote.xml
         | 
| 85 105 | 
             
            - spec/nesting_spec.rb
         | 
| 106 | 
            +
            - spec/spec_helper.rb
         | 
| 86 107 | 
             
            - spec/table.html
         | 
| 87 108 | 
             
            - spec/table.markdown
         | 
| 88 109 | 
             
            - spec/table2.html
         | 
| @@ -103,7 +124,7 @@ required_ruby_version: !ruby/object:Gem::Requirement | |
| 103 124 | 
             
                  version: '0'
         | 
| 104 125 | 
             
                  segments:
         | 
| 105 126 | 
             
                  - 0
         | 
| 106 | 
            -
                  hash: - | 
| 127 | 
            +
                  hash: -1182280358588269194
         | 
| 107 128 | 
             
            required_rubygems_version: !ruby/object:Gem::Requirement
         | 
| 108 129 | 
             
              none: false
         | 
| 109 130 | 
             
              requirements:
         | 
| @@ -112,7 +133,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement | |
| 112 133 | 
             
                  version: '0'
         | 
| 113 134 | 
             
                  segments:
         | 
| 114 135 | 
             
                  - 0
         | 
| 115 | 
            -
                  hash: - | 
| 136 | 
            +
                  hash: -1182280358588269194
         | 
| 116 137 | 
             
            requirements: []
         | 
| 117 138 | 
             
            rubyforge_project: 
         | 
| 118 139 | 
             
            rubygems_version: 1.8.23
         | 
| @@ -122,12 +143,17 @@ summary: Converts HTML to Markdown | |
| 122 143 | 
             
            test_files:
         | 
| 123 144 | 
             
            - spec/asmartbear.html
         | 
| 124 145 | 
             
            - spec/asmartbear.markdown
         | 
| 146 | 
            +
            - spec/code.html
         | 
| 147 | 
            +
            - spec/code_spec.rb
         | 
| 148 | 
            +
            - spec/code_with_language.markdown
         | 
| 149 | 
            +
            - spec/code_without_language.markdown
         | 
| 125 150 | 
             
            - spec/doc.html
         | 
| 126 151 | 
             
            - spec/doc.markdown
         | 
| 127 152 | 
             
            - spec/doc_spec.rb
         | 
| 128 153 | 
             
            - spec/evernote.markdown
         | 
| 129 154 | 
             
            - spec/evernote.xml
         | 
| 130 155 | 
             
            - spec/nesting_spec.rb
         | 
| 156 | 
            +
            - spec/spec_helper.rb
         | 
| 131 157 | 
             
            - spec/table.html
         | 
| 132 158 | 
             
            - spec/table.markdown
         | 
| 133 159 | 
             
            - spec/table2.html
         |