RubyGems - omnivore - Versions diffs - 0.0.3 → 0.0.4 - Mend

omnivore 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

data/lib/omnivore/document.rb +6 -20
data/lib/omnivore/version.rb +1 -1
data/spec/document_spec.rb +7 -21
data/spec/fixtures/thia-breen-interview +2322 -0
metadata +3 -2

data/lib/omnivore/document.rb CHANGED

@@ -5,8 +5,8 @@ module Omnivore
   class Document
     attr_reader :model
-    CONTAINER_TAGS = %w[div p]
-    Paragraph = Struct.new("Block", :path, :html, :text)
+    BLOCK_TAGS = %w[div p frame bod]
+    Paragraph = Struct.new("Paragraph", :path, :text, :text_density)
     def self.from_url(url)
@@ -45,39 +45,25 @@ module Omnivore
     def to_text
-      paragraphs = self.to_paragraphs.keep_if { |p| (p.text.size / p.html.size.to_f) > 0.1 }
+      paragraphs = self.to_paragraphs.keep_if { |p| p.text_density > 0.5 }
       paragraphs.map { |p| p.text }.join("\n")
     end
     def to_paragraphs
-      filter(self.model.xpath("/html/body")).map { |block|
+      self.model.xpath("//div|//p").map { |block|
         html = block.to_html.gsub(/\s+/, " ").strip
         text = flatten(block).inject([ ]) { |memo, node|
           memo << node.text.gsub(/\s+/, " ").strip if node.kind_of?(Nokogiri::XML::Text)
           memo
         }.join(" ")
-        Paragraph.new(block.path.to_s, html, text)
+        Paragraph.new(block.path.to_s, text, text.size / html.size.to_f)
       }
     end
     private
-    def filter(container)
-      elements = [ ]
-      container.children.each { |child|
-        if CONTAINER_TAGS.include?(child.name)
-          unless child.attr("class") =~ /comment/i
-            elements << child
-            elements += filter(child)
-          end
-        end
-      }
-      elements
-    end
     def flatten(block)
       elements = [ ]
       return elements if block.nil?
@@ -87,7 +73,7 @@ module Omnivore
         elements << block
       else
         block.children.each { |child|
-          unless %w[div p].include?(child.name)
+          unless BLOCK_TAGS.include?(child.name)
             elements += flatten(child)
           end
         }

data/lib/omnivore/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Omnivore
-  VERSION = "0.0.3"
+  VERSION = "0.0.4"
 end

data/spec/document_spec.rb CHANGED

@@ -2,20 +2,7 @@ require 'omnivore/document'
 include Omnivore
 describe Document do
-  STATIC_HTML = %{
-    <html>
-      <head>
-        <title>Nothing To See</title>
-        <meta name="description" content="This is a test page.">
-        <meta name="keywords" content="metadata, testing, kayne west">
-      </head>
-      <body>
-        <p>Nothing to see here, move along.</p>
-      </body>
-    </html>
-  }
+  html = File.open("spec/fixtures/thia-breen-interview", "r") { |f| f.readlines }.join("\n")
   it "should fetch the content of the provided url" do
     document = Document.from_url("http://www.google.com")
@@ -24,28 +11,27 @@ describe Document do
   it "should contain the document title" do
-    document = Document.from_html(STATIC_HTML)
+    document = Document.from_html(html)
     document.title.should_not be_nil
     document.title.should_not be_empty
-    document.title.should == "Nothing To See"
+    document.title.should == "Estee Lauder President Thia Breen Interview - Career Advice from Thia Breen - Marie Claire"
   end
   it "should contain the document metadata" do
-    document = Document.from_html(STATIC_HTML)
+    document = Document.from_html(html)
     document.metadata.should_not be_nil
     document.metadata.should_not be_empty
-    document.metadata["keywords"].split(",").first.strip.should == "metadata"
+    document.metadata["keywords"].split(",").first.strip.should == "career advice"
   end
   it "should be able to extract the main content and ignore navigation and ads." do
-    #document = Document.from_url("http://www.marieclaire.com/career-money/jobs/thia-breen-interview")
-    document = Document.from_html(STATIC_HTML)
+    document = Document.from_html(html)
     text = document.to_text
     text.should_not be_nil
     text.should_not be_empty
-    text.should == "Nothing to see here, move along."
   end
 end