RubyGems - omnivore - Versions diffs - 0.0.3 → 0.0.4 - Mend

omnivore 0.0.3 → 0.0.4

Files changed (5)

data/lib/omnivore/document.rb +6 -20
data/lib/omnivore/version.rb +1 -1
data/spec/document_spec.rb +7 -21
data/spec/fixtures/thia-breen-interview +2322 -0
metadata +3 -2

data/lib/omnivore/document.rb CHANGED

@@ -5,8 +5,8 @@ module Omnivore
   class Document
     attr_reader :model
-    CONTAINER_TAGS = %w[div p]
-    Paragraph = Struct.new("Block", :path, :html, :text)
+    BLOCK_TAGS = %w[div p frame bod]
+    Paragraph = Struct.new("Paragraph", :path, :text, :text_density)
     def self.from_url(url)
@@ -45,39 +45,25 @@ module Omnivore
     def to_text
-      paragraphs = self.to_paragraphs.keep_if { |p| (p.text.size / p.html.size.to_f) > 0.1 }
+      paragraphs = self.to_paragraphs.keep_if { |p| p.text_density > 0.5 }
       paragraphs.map { |p| p.text }.join("\n")
     end
     def to_paragraphs
-      filter(self.model.xpath("/html/body")).map { |block|
+      self.model.xpath("//div|//p").map { |block|
         html = block.to_html.gsub(/\s+/, " ").strip
         text = flatten(block).inject([ ]) { |memo, node|
           memo << node.text.gsub(/\s+/, " ").strip if node.kind_of?(Nokogiri::XML::Text)
           memo
         }.join(" ")
-        Paragraph.new(block.path.to_s, html, text)
+        Paragraph.new(block.path.to_s, text, text.size / html.size.to_f)
       }
     end
     private
-    def filter(container)
-      elements = [ ]
-      container.children.each { |child|
-        if CONTAINER_TAGS.include?(child.name)
-          unless child.attr("class") =~ /comment/i
-            elements << child
-            elements += filter(child)
-          end
-        end
-      }
-      elements
-    end
     def flatten(block)
       elements = [ ]
       return elements if block.nil?
@@ -87,7 +73,7 @@ module Omnivore
         elements << block
       else
         block.children.each { |child|
-          unless %w[div p].include?(child.name)
+          unless BLOCK_TAGS.include?(child.name)
             elements += flatten(child)
           end
         }

data/lib/omnivore/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Omnivore
-  VERSION = "0.0.3"
+  VERSION = "0.0.4"
 end

data/spec/document_spec.rb CHANGED

@@ -2,20 +2,7 @@ require 'omnivore/document'
 include Omnivore
 describe Document do
-  STATIC_HTML = %{
-    <html>
-      <head>
-        <title>Nothing To See</title>
-        <meta name="description" content="This is a test page.">
-        <meta name="keywords" content="metadata, testing, kayne west">
-      </head>
-      <body>
-        <p>Nothing to see here, move along.</p>
-      </body>
-    </html>
-  }
+  html = File.open("spec/fixtures/thia-breen-interview", "r") { |f| f.readlines }.join("\n")
   it "should fetch the content of the provided url" do
     document = Document.from_url("http://www.google.com")
@@ -24,28 +11,27 @@ describe Document do
   it "should contain the document title" do
-    document = Document.from_html(STATIC_HTML)
+    document = Document.from_html(html)
     document.title.should_not be_nil
     document.title.should_not be_empty
-    document.title.should == "Nothing To See"
+    document.title.should == "Estee Lauder President Thia Breen Interview - Career Advice from Thia Breen - Marie Claire"
   end
   it "should contain the document metadata" do
-    document = Document.from_html(STATIC_HTML)
+    document = Document.from_html(html)
     document.metadata.should_not be_nil
     document.metadata.should_not be_empty
-    document.metadata["keywords"].split(",").first.strip.should == "metadata"
+    document.metadata["keywords"].split(",").first.strip.should == "career advice"
   end
   it "should be able to extract the main content and ignore navigation and ads." do
-    #document = Document.from_url("http://www.marieclaire.com/career-money/jobs/thia-breen-interview")
-    document = Document.from_html(STATIC_HTML)
+    document = Document.from_html(html)
     text = document.to_text
     text.should_not be_nil
     text.should_not be_empty
-    text.should == "Nothing to see here, move along."
   end
 end