RubyGems - omnivore - Versions diffs - 0.0.1 → 0.0.2 - Mend

omnivore 0.0.1 → 0.0.2

Files changed (12) hide show

data/lib/omnivore/document.rb ADDED Viewed

@@ -0,0 +1,33 @@
+module Omnivore
+  require "omnivore/http_client"
+  require "omnivore/html_helper"
+  # An HTML page held as a string, with accessors for data extracted from it.
+  class Document
+    # The HTML source this document was constructed with.
+    attr_reader :html
+    class << self
+      # Builds a Document from whatever HttpClient.get returns for +url+.
+      def from_url(url)
+        Document.new(HttpClient.get(url))
+      end
+      # Builds a Document directly from the given HTML.
+      def from_html(html)
+        Document.new(html)
+      end
+    end
+    # html - the HTML source to wrap; stored as-is.
+    def initialize(html)
+      @html = html
+    end
+    # Text of the first "/html/head/title" match. Computed on the first call
+    # and cached in @title for later calls.
+    def title
+      @title ||= begin
+        title_nodes = HtmlHelper.xpath(html, "/html/head/title")
+        HtmlHelper.to_text(title_nodes.first) || ""
+      end
+    end
+  end
+end

data/lib/omnivore/html_helper.rb ADDED Viewed

@@ -0,0 +1,52 @@
+require "nokogiri"
+module Omnivore
+  # Helpers for running XPath queries against HTML and reducing HTML to plain text.
+  module HtmlHelper
+    # Reduces an HTML string to its text content.
+    class HtmlTransformer
+      # html - the HTML source to transform.
+      def initialize(html)
+        @html = html
+      end
+      # Returns the text nodes of the parsed HTML joined with single spaces,
+      # trimmed, with every run of whitespace collapsed to one space. Text
+      # inside <style> and <script> elements is left out.
+      def to_text
+        document = Nokogiri::HTML.parse(@html)
+        partition(document, 'style', 'script').values.join(' ').strip.gsub(/\s+/, ' ')
+      end
+      # Walks the tree under +node+ and returns a Hash of text-node path => text.
+      # A nil node, CDATA sections, comments, and elements whose name is listed
+      # in +ignore_tags+ (including everything beneath them) contribute nothing.
+      #
+      # node        - the node to walk (may be nil).
+      # ignore_tags - element names to skip; compared case-insensitively.
+      def partition(node, *ignore_tags)
+        elements = {}
+        return elements if node.nil?
+        return elements if node.respond_to?(:cdata?) && node.cdata?
+        return elements if node.respond_to?(:comment?) && node.comment?
+        return elements if node.kind_of?(Nokogiri::XML::Element) && ignored_tag?(node.name, ignore_tags)
+        if node.kind_of?(Nokogiri::XML::Text)
+          elements[node.path.to_s] = node.text
+          return elements
+        end
+        node.children.each do |child|
+          elements.merge!(partition(child, *ignore_tags))
+        end
+        elements
+      end
+      private
+      # True when +name+ equals one of +ignore_tags+, ignoring case. The whole
+      # name must match: ignoring 'script' must not also drop <noscript>, and
+      # tag names are never interpreted as regular-expression syntax.
+      def ignored_tag?(name, ignore_tags)
+        ignore_tags.any? { |tag| tag.to_s.casecmp(name.to_s) == 0 }
+      end
+    end
+    # Returns the plain text of +html+; see HtmlTransformer#to_text.
+    def self.to_text(html)
+      HtmlTransformer.new(html).to_text
+    end
+    # Parses +html+ and returns the HTML of every node matching +xpath+,
+    # as an Array of Strings.
+    def self.xpath(html, xpath)
+      Nokogiri::HTML.parse(html).xpath(xpath).map { |match| match.to_html }
+    end
+  end
+end

data/lib/omnivore/http_client.rb CHANGED Viewed

@@ -5,7 +5,7 @@ module Omnivore
   class HttpClient
-    def HttpClient.get(url, attempts=3)
+    def self.get(url, attempts=3)
       raise ArgumentError, 'HTTP redirect too deep' if attempts == 0
       response = Net::HTTP.get_response(URI.parse(url))

data/lib/omnivore/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Omnivore
-  VERSION = "0.0.1"
+  VERSION = "0.0.2"
 end

data/lib/omnivore.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 require "omnivore/version"
-require "omnivore/http_client"
-require "omnivore/xpath_extractor"
+require "omnivore/document"
+#require "omnivore/http_client"
+#require "omnivore/xpath_extractor"
 module Omnivore
   # Your code goes here...

data/omnivore.gemspec CHANGED Viewed

@@ -19,6 +19,6 @@ Gem::Specification.new do |s|
   s.require_paths = ["lib"]
   # specify any dependencies here; for example:
-  s.add_development_dependency "rspec"
-  # s.add_runtime_dependency "rest-client"
+  s.add_development_dependency "rspec", "~> 2.8.0"
+  s.add_runtime_dependency "nokogiri", "~> 1.5.0"
 end

data/spec/document_spec.rb ADDED Viewed

@@ -0,0 +1,30 @@
+require 'omnivore/document'
+include Omnivore
+# Exercises Document construction (from a URL and from a string) and title extraction.
+describe Omnivore::Document do
+  # Small page with a known <title>, used by the example that needs no fetch.
+  STATIC_HTML = %{
+    <html>
+      <head>
+        <title>Nothing To See</title>
+      </head>
+      <body>
+        <p>Nothing to see here, move along.</p>
+      </body>
+    </html>
+  }
+  # NOTE: from_url goes through HttpClient.get, so this example issues a real request.
+  it "should fetch the content of the provided url" do
+    fetched = Omnivore::Document.from_url("http://www.google.com")
+    fetched.html.should_not be_nil
+    fetched.html.should_not be_empty
+  end
+  it "should provide the document title" do
+    page = Omnivore::Document.from_html(STATIC_HTML)
+    page.title.should_not be_nil
+    page.title.should_not be_empty
+    page.title.should == "Nothing To See"
+  end
+end

data/spec/html_helper_spec.rb ADDED Viewed

@@ -0,0 +1,42 @@
+require "omnivore/html_helper"
+include Omnivore
+# Covers HtmlHelper.xpath node matching and HtmlHelper.to_text extraction.
+describe Omnivore::HtmlHelper do
+  it "should match the correct xpath" do
+    # Two sibling divs; only the one with class "content" should be selected.
+    page = %{
+      <html>
+        <head></head>
+        <body>
+          <div class="banner">
+            I don't want to see this.
+          </div>
+          <div class="content">
+            This is what I want to see.
+          </div>
+        </body>
+      </html>
+    }
+    hits = Omnivore::HtmlHelper.xpath(page, "//div[@class=\"content\"]")
+    hits.size.should == 1
+  end
+  it "should be able to extract text from markup" do
+    # Nested list and link markup; the expected text is flattened onto one line.
+    markup = %{
+      <p>
+        Content may contain some additional markup, such as:
+        <ul>
+          <li>Ordered or unordered lists,</li>
+          <li><a href="#">Hyperlinks,</a></li>
+          <li>and Images
+        </ul>
+      </p>
+    }
+    plain = Omnivore::HtmlHelper.to_text(markup)
+    plain.should == "Content may contain some additional markup, such as: Ordered or unordered lists, Hyperlinks, and Images"
+  end
+end

data/spec/http_client_spec.rb CHANGED Viewed

@@ -3,7 +3,7 @@ require 'omnivore/http_client'
 describe Omnivore::HttpClient do
   it "should fetch the content of a url" do
-    html = Omnivore::HttpClient.get("http://blog.steveklabnik.com/posts/2011-09-28-real-modern-ruby-development")
+    html = Omnivore::HttpClient.get("http://linksmart.com")
     html.should_not be_nil
     html.should_not be_empty
   end

metadata CHANGED Viewed

@@ -2,7 +2,7 @@
 name: omnivore
 version: !ruby/object:Gem::Version
   prerelease:
-  version: 0.0.1
+  version: 0.0.2
 platform: ruby
 authors:
 - Matthias Eder
@@ -19,11 +19,22 @@ dependencies:
   requirement: &id001 !ruby/object:Gem::Requirement
     none: false
     requirements:
-    - - ">="
+    - - ~>
       - !ruby/object:Gem::Version
-        version: "0"
+        version: 2.8.0
   type: :development
   version_requirements: *id001
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  prerelease: false
+  requirement: &id002 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.5.0
+  type: :runtime
+  version_requirements: *id002
 description: A library for extracting content from HTML documents.
 email:
 - matthias@izume.com
@@ -40,12 +51,14 @@ files:
 - README.md
 - Rakefile
 - lib/omnivore.rb
+- lib/omnivore/document.rb
+- lib/omnivore/html_helper.rb
 - lib/omnivore/http_client.rb
 - lib/omnivore/version.rb
-- lib/omnivore/xpath_extractor.rb
 - omnivore.gemspec
+- spec/document_spec.rb
+- spec/html_helper_spec.rb
 - spec/http_client_spec.rb
-- spec/xpath_extractor_spec.rb
 has_rdoc: true
 homepage: ""
 licenses: []

data/lib/omnivore/xpath_extractor.rb DELETED Viewed

@@ -1,12 +0,0 @@
-require "rexml/document"
-module Omnivore
-  class XPathExtractor
-    def XPathExtractor.match(html, xpath)
-      xmldoc = REXML::Document.new(html)
-      REXML::XPath.match(xmldoc, xpath)
-    end
-  end
-end

data/spec/xpath_extractor_spec.rb DELETED Viewed

@@ -1,30 +0,0 @@
-require "omnivore/xpath_extractor"
-CONTENT = %{
-  <html>
-    <head></head>
-    <body>
-      <div class="banner">
-        This is a banner
-      </div>
-      <div class="topnav">
-        <ul>
-          <li>Home</li>
-          <li>About</li>
-        </ul>
-      </div>
-      <div class="content">
-        This is where the real stuff is.
-      </div>
-    </body>
-  </html>
-}
-describe Omnivore::XPathExtractor do
-  it "should match the correct xpath" do
-    matches = Omnivore::XPathExtractor.match(CONTENT, "//div[@class=\"content\"]")
-    matches.size.should be > 0
-  end
-end