omnivore 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,33 @@
module Omnivore
  require "omnivore/http_client"
  require "omnivore/html_helper"

  # Wraps a string of HTML markup and exposes pieces of it (currently the
  # page title) extracted through HtmlHelper.
  class Document
    # The markup this document was constructed with.
    attr_reader :html

    class << self
      # Builds a Document from whatever HttpClient.get yields for +url+.
      def from_url(url)
        Document.new(HttpClient.get(url))
      end

      # Builds a Document directly from an HTML string.
      def from_html(html)
        Document.new(html)
      end
    end

    def initialize(html)
      @html = html
    end

    # Text of the first node matching /html/head/title, or "" when the
    # helper yields nothing. The value is computed once and memoized.
    def title
      @title ||= begin
        title_nodes = HtmlHelper.xpath(html, "/html/head/title")
        HtmlHelper.to_text(title_nodes.first) || ""
      end
    end
  end
end
@@ -0,0 +1,52 @@
1
+ require "nokogiri"
2
+
module Omnivore
  # Helpers for running XPath queries against HTML and for reducing HTML
  # markup to plain text. Parsing is delegated to Nokogiri.
  module HtmlHelper

    # Turns an HTML string into whitespace-normalized plain text.
    class HtmlTransformer

      # html - the markup to be converted.
      def initialize(html)
        @html = html
      end

      # Returns the text content of the markup as a single String: text
      # fragments are joined with spaces, the result is stripped, and every
      # run of whitespace is collapsed to one space. Content inside <style>
      # and <script> elements is left out.
      def to_text
        document = Nokogiri::HTML.parse(@html)
        partition(document, 'style', 'script').values.join(' ').strip.gsub(/\s+/, ' ')
      end

      # Recursively collects the text nodes found beneath +node+.
      #
      # node        - a parsed node (or nil).
      # ignore_tags - element names whose whole subtree is skipped; names are
      #               compared case-insensitively.
      #
      # Returns a Hash mapping each text node's path to its text. nil nodes,
      # CDATA sections, comments and ignored elements contribute nothing.
      def partition(node, *ignore_tags)
        return {} if node.nil?
        return {} if node.respond_to?(:cdata?) && node.cdata?
        return {} if node.respond_to?(:comment?) && node.comment?
        return {} if ignored_element?(node, ignore_tags)

        return { node.path.to_s => node.text } if node.kind_of?(Nokogiri::XML::Text)

        elements = {}
        node.children.each do |child|
          elements.merge!(partition(child, *ignore_tags))
        end
        elements
      end

      private

      # True when +node+ is an element whose name equals one of +ignore_tags+.
      # Whole names are compared (case-insensitively) rather than matched as
      # a regex, so "noscript" is not mistaken for "script" and tag names
      # containing regex metacharacters cannot alter the match.
      def ignored_element?(node, ignore_tags)
        return false unless node.kind_of?(Nokogiri::XML::Element)
        ignore_tags.any? { |tag| tag.to_s.casecmp(node.name.to_s) == 0 }
      end

    end

    # Converts +html+ to plain text; see HtmlTransformer#to_text.
    def self.to_text(html)
      HtmlTransformer.new(html).to_text
    end

    # Parses +html+ and evaluates +xpath+ against it.
    # Returns an Array with the HTML serialization of every matching node.
    def self.xpath(html, xpath)
      document = Nokogiri::HTML.parse(html)
      document.xpath(xpath).map { |match| match.to_html }
    end

  end
end
@@ -5,7 +5,7 @@ module Omnivore
5
5
  class HttpClient
6
6
 
7
7
 
8
- def HttpClient.get(url, attempts=3)
8
+ def self.get(url, attempts=3)
9
9
  raise ArgumentError, 'HTTP redirect too deep' if attempts == 0
10
10
 
11
11
  response = Net::HTTP.get_response(URI.parse(url))
@@ -1,3 +1,3 @@
1
1
  module Omnivore
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
data/lib/omnivore.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  require "omnivore/version"
2
- require "omnivore/http_client"
3
- require "omnivore/xpath_extractor"
2
+ require "omnivore/document"
3
+ #require "omnivore/http_client"
4
+ #require "omnivore/xpath_extractor"
4
5
 
5
6
  module Omnivore
6
7
  # Your code goes here...
data/omnivore.gemspec CHANGED
@@ -19,6 +19,6 @@ Gem::Specification.new do |s|
19
19
  s.require_paths = ["lib"]
20
20
 
21
21
  # specify any dependencies here; for example:
22
- s.add_development_dependency "rspec"
23
- # s.add_runtime_dependency "rest-client"
22
+ s.add_development_dependency "rspec", "~> 2.8.0"
23
+ s.add_runtime_dependency "nokogiri", "~> 1.5.0"
24
24
  end
@@ -0,0 +1,30 @@
1
+ require 'omnivore/document'
2
+ include Omnivore
3
+
# Specs for Omnivore::Document construction and title extraction.
describe Document do

  STATIC_HTML = %{
    <html>
      <head>
        <title>Nothing To See</title>
      </head>
      <body>
        <p>Nothing to see here, move along.</p>
      </body>
    </html>
  }

  # The HTTP layer is replaced with a message expectation so this example
  # runs offline and deterministically; it verifies that from_url hands the
  # URL to HttpClient.get and keeps the returned body.
  it "should fetch the content of the provided url" do
    HttpClient.should_receive(:get).with("http://www.google.com").and_return(STATIC_HTML)
    document = Document.from_url("http://www.google.com")
    document.html.should == STATIC_HTML
  end

  it "should provide the document title" do
    document = Document.from_html(STATIC_HTML)
    document.title.should_not be_nil
    document.title.should_not be_empty
    document.title.should == "Nothing To See"
  end

end
@@ -0,0 +1,42 @@
1
+ require "omnivore/html_helper"
2
+ include Omnivore
3
+
4
+
5
+
# Specs for the XPath query and text-extraction helpers.
describe HtmlHelper do

  it "should match the correct xpath" do
    page = %{
      <html>
        <head></head>
        <body>
          <div class="banner">
            I don't want to see this.
          </div>
          <div class="content">
            This is what I want to see.
          </div>
        </body>
      </html>
    }
    # Only the div carrying class="content" should be selected.
    hits = HtmlHelper.xpath(page, '//div[@class="content"]')
    hits.size.should == 1
  end

  it "should be able to extract text from markup" do
    fragment = %{
      <p>
        Content may contain some additional markup, such as:
        <ul>
          <li>Ordered or unordered lists,</li>
          <li><a href="#">Hyperlinks,</a></li>
          <li>and Images
        </ul>
      </p>
    }
    # Tags are dropped and whitespace runs collapse to single spaces.
    plain = HtmlHelper.to_text(fragment)
    plain.should == "Content may contain some additional markup, such as: Ordered or unordered lists, Hyperlinks, and Images"
  end

end
@@ -3,7 +3,7 @@ require 'omnivore/http_client'
3
3
  describe Omnivore::HttpClient do
4
4
 
5
5
  it "should fetch the content of a url" do
6
- html = Omnivore::HttpClient.get("http://blog.steveklabnik.com/posts/2011-09-28-real-modern-ruby-development")
6
+ html = Omnivore::HttpClient.get("http://linksmart.com")
7
7
  html.should_not be_nil
8
8
  html.should_not be_empty
9
9
  end
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: omnivore
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.0.1
5
+ version: 0.0.2
6
6
  platform: ruby
7
7
  authors:
8
8
  - Matthias Eder
@@ -19,11 +19,22 @@ dependencies:
19
19
  requirement: &id001 !ruby/object:Gem::Requirement
20
20
  none: false
21
21
  requirements:
22
- - - ">="
22
+ - - ~>
23
23
  - !ruby/object:Gem::Version
24
- version: "0"
24
+ version: 2.8.0
25
25
  type: :development
26
26
  version_requirements: *id001
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ prerelease: false
30
+ requirement: &id002 !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - ~>
34
+ - !ruby/object:Gem::Version
35
+ version: 1.5.0
36
+ type: :runtime
37
+ version_requirements: *id002
27
38
  description: A library for extracting content from HTML documents.
28
39
  email:
29
40
  - matthias@izume.com
@@ -40,12 +51,14 @@ files:
40
51
  - README.md
41
52
  - Rakefile
42
53
  - lib/omnivore.rb
54
+ - lib/omnivore/document.rb
55
+ - lib/omnivore/html_helper.rb
43
56
  - lib/omnivore/http_client.rb
44
57
  - lib/omnivore/version.rb
45
- - lib/omnivore/xpath_extractor.rb
46
58
  - omnivore.gemspec
59
+ - spec/document_spec.rb
60
+ - spec/html_helper_spec.rb
47
61
  - spec/http_client_spec.rb
48
- - spec/xpath_extractor_spec.rb
49
62
  has_rdoc: true
50
63
  homepage: ""
51
64
  licenses: []
@@ -1,12 +0,0 @@
1
- require "rexml/document"
2
-
3
- module Omnivore
4
- class XPathExtractor
5
-
6
- def XPathExtractor.match(html, xpath)
7
- xmldoc = REXML::Document.new(html)
8
- REXML::XPath.match(xmldoc, xpath)
9
- end
10
-
11
- end
12
- end
@@ -1,30 +0,0 @@
1
- require "omnivore/xpath_extractor"
2
-
3
- CONTENT = %{
4
- <html>
5
- <head></head>
6
- <body>
7
- <div class="banner">
8
- This is a banner
9
- </div>
10
- <div class="topnav">
11
- <ul>
12
- <li>Home</li>
13
- <li>About</li>
14
- </ul>
15
- </div>
16
- <div class="content">
17
- This is where the real stuff is.
18
- </div>
19
- </body>
20
- </html>
21
- }
22
-
23
- describe Omnivore::XPathExtractor do
24
-
25
- it "should match the correct xpath" do
26
- matches = Omnivore::XPathExtractor.match(CONTENT, "//div[@class=\"content\"]")
27
- matches.size.should be > 0
28
- end
29
-
30
- end