RubyGems - omnivore - Versions diffs - 0.0.2 → 0.0.3 - Mend

omnivore 0.0.2 → 0.0.3

This diff shows the differences between the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (6)

data/lib/omnivore/document.rb CHANGED Viewed

@@ -1,33 +1,100 @@
+require "nokogiri"
+require "omnivore/http_client"
 module Omnivore
-  require "omnivore/http_client"
-  require "omnivore/html_helper"
   class Document
-    attr_reader :html
+    attr_reader :model
+    CONTAINER_TAGS = %w[div p]
+    Paragraph = Struct.new("Block", :path, :html, :text)
     def self.from_url(url)
       Document.new(HttpClient.get(url))
     end
     def self.from_html(html)
       Document.new(html)
     end
     def initialize(html)
-      @html = html
+      @model = Nokogiri::HTML.parse(html) { |config|
+        config.options = Nokogiri::XML::ParseOptions::NOBLANKS
+      }
+    end
+    def to_html
+      self.model.to_html
     end
     def title
-      unless @title
-        matches = HtmlHelper.xpath(self.html, "/html/head/title")
-        @title = HtmlHelper.to_text(matches.first) || ""
-      end
-      @title
+      @title ||= self.model.xpath("/html/head/title").text.gsub(/\s+/, " ").strip
+    end
+    def metadata
+      @metadata ||= self.model.xpath("//meta").inject({ }) { |memo, el|
+        memo[el.attr("name")] = el.attr("content") || "" if el.attr("name")
+        memo
+      }
     end
+    def to_text
+      paragraphs = self.to_paragraphs.keep_if { |p| (p.text.size / p.html.size.to_f) > 0.1 }
+      paragraphs.map { |p| p.text }.join("\n")
+    end
+    def to_paragraphs
+      filter(self.model.xpath("/html/body")).map { |block|
+        html = block.to_html.gsub(/\s+/, " ").strip
+        text = flatten(block).inject([ ]) { |memo, node|
+          memo << node.text.gsub(/\s+/, " ").strip if node.kind_of?(Nokogiri::XML::Text)
+          memo
+        }.join(" ")
+        Paragraph.new(block.path.to_s, html, text)
+      }
+    end
+    private
+    def filter(container)
+      elements = [ ]
+      container.children.each { |child|
+        if CONTAINER_TAGS.include?(child.name)
+          unless child.attr("class") =~ /comment/i
+            elements << child
+            elements += filter(child)
+          end
+        end
+      }
+      elements
+    end
+    def flatten(block)
+      elements = [ ]
+      return elements if block.nil?
+      return elements if block.respond_to?('cdata?') and block.cdata?
+      return elements if block.respond_to?('comment?') and block.comment?
+      if block.children.empty?
+        elements << block
+      else
+        block.children.each { |child|
+          unless %w[div p].include?(child.name)
+            elements += flatten(child)
+          end
+        }
+      end
+      elements
+    end
   end
 end

data/lib/omnivore/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Omnivore
-  VERSION = "0.0.2"
+  VERSION = "0.0.3"
 end

data/spec/document_spec.rb CHANGED Viewed

@@ -7,6 +7,8 @@ describe Document do
     <html>
       <head>
         <title>Nothing To See</title>
+        <meta name="description" content="This is a test page.">
+        <meta name="keywords" content="metadata, testing, kayne west">
       </head>
       <body>
         <p>Nothing to see here, move along.</p>
@@ -14,17 +16,36 @@ describe Document do
     </html>
   }
   it "should fetch the content of the provided url" do
     document = Document.from_url("http://www.google.com")
-    document.html.should_not be_nil
-    document.html.should_not be_empty
+    document.to_html.should_not be_empty
   end
-  it "should provide the document title" do
+  it "should contain the document title" do
     document = Document.from_html(STATIC_HTML)
     document.title.should_not be_nil
     document.title.should_not be_empty
     document.title.should == "Nothing To See"
   end
+  it "should contain the document metadata" do
+    document = Document.from_html(STATIC_HTML)
+    document.metadata.should_not be_nil
+    document.metadata.should_not be_empty
+    document.metadata["keywords"].split(",").first.strip.should == "metadata"
+  end
+  it "should be able to extract the main content and ignore navigation and ads." do
+    #document = Document.from_url("http://www.marieclaire.com/career-money/jobs/thia-breen-interview")
+    document = Document.from_html(STATIC_HTML)
+    text = document.to_text
+    text.should_not be_nil
+    text.should_not be_empty
+    text.should == "Nothing to see here, move along."
+  end
 end

metadata CHANGED Viewed

@@ -2,7 +2,7 @@
 name: omnivore
 version: !ruby/object:Gem::Version
   prerelease:
-  version: 0.0.2
+  version: 0.0.3
 platform: ruby
 authors:
 - Matthias Eder
@@ -10,7 +10,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-01-05 00:00:00 -07:00
+date: 2012-01-10 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -52,12 +52,10 @@ files:
 - Rakefile
 - lib/omnivore.rb
 - lib/omnivore/document.rb
-- lib/omnivore/html_helper.rb
 - lib/omnivore/http_client.rb
 - lib/omnivore/version.rb
 - omnivore.gemspec
 - spec/document_spec.rb
-- spec/html_helper_spec.rb
 - spec/http_client_spec.rb
 has_rdoc: true
 homepage: ""

data/lib/omnivore/html_helper.rb DELETED Viewed

@@ -1,52 +0,0 @@
-require "nokogiri"
-module Omnivore
-  module HtmlHelper
-    class HtmlTransformer
-      def initialize(html)
-        @html = html
-      end
-      def to_text
-        document = Nokogiri::HTML.parse(@html)
-        partition(document, 'style', 'script').values.join(' ').strip.gsub(/\s+/, ' ')
-      end
-      def partition(node, *ignore_tags)
-        elements = { }
-        return elements if node.nil?
-        return elements if node.respond_to?('cdata?') and node.cdata?
-        return elements if node.respond_to?('comment?') and node.comment?
-        if node.kind_of?(Nokogiri::XML::Element) and ignore_tags and ignore_tags.size > 0
-          return elements if node.name =~ %r[#{ignore_tags.join('|')}]i
-        end
-        elements = { }
-        if node.kind_of?(Nokogiri::XML::Text)
-          elements[node.path.to_s] = node.text
-          return elements
-        end
-        node.children.each do |child|
-          elements.merge!(partition(child, *ignore_tags))
-        end
-        elements
-      end
-    end
-    def HtmlHelper.to_text(html)
-      transformer = HtmlTransformer.new(html)
-      transformer.to_text
-    end
-    def HtmlHelper.xpath(html, xpath)
-      document = Nokogiri::HTML.parse(html)
-      document.xpath(xpath).map { |m| m.to_html }
-    end
-  end
-end

data/spec/html_helper_spec.rb DELETED Viewed

@@ -1,42 +0,0 @@
-require "omnivore/html_helper"
-include Omnivore
-describe HtmlHelper do
-  it "should match the correct xpath" do
-    content = %{
-      <html>
-        <head></head>
-        <body>
-          <div class="banner">
-            I don't want to see this.
-          </div>
-          <div class="content">
-            This is what I want to see.
-          </div>
-        </body>
-      </html>
-    }
-    matches = HtmlHelper.xpath(content, "//div[@class=\"content\"]")
-    matches.size.should == 1
-  end
-  it "should be able to extract text from markup" do
-    html = %{
-      <p>
-        Content may contain some additional markup, such as:
-        <ul>
-          <li>Ordered or unordered lists,</li>
-          <li><a href="#">Hyperlinks,</a></li>
-          <li>and Images
-        </ul>
-      </p>
-    }
-    text = HtmlHelper.to_text(html)
-    text.should == "Content may contain some additional markup, such as: Ordered or unordered lists, Hyperlinks, and Images"
-  end
-end