omnivore 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,33 @@
1
+ module Omnivore
2
+ require "omnivore/http_client"
3
+ require "omnivore/html_helper"
4
+
5
# An HTML document held as raw markup, with its title extracted on demand.
class Document
  # @return [Object] the markup this document was constructed with
  attr_reader :html

  # Builds a document from whatever HttpClient.get returns for +url+
  # (presumably the response body -- confirm against HttpClient).
  #
  # Uses +new+ rather than +Document.new+ so that a subclass calling this
  # factory receives an instance of itself.
  #
  # @param url [String] address passed straight to HttpClient.get
  # @return [Document] a new instance of the receiving class
  def self.from_url(url)
    new(HttpClient.get(url))
  end

  # Builds a document from markup that is already in memory.
  #
  # @param html [String] the markup to wrap
  # @return [Document] a new instance of the receiving class
  def self.from_html(html)
    new(html)
  end

  # @param html [String] the markup to wrap; stored as given
  def initialize(html)
    @html = html
  end

  # Text of the first node matching /html/head/title, computed on first
  # call and cached in @title afterwards.
  #
  # @return [String] the title text, or "" if the helper yields nil
  def title
    @title ||= begin
      matches = HtmlHelper.xpath(html, "/html/head/title")
      HtmlHelper.to_text(matches.first) || ""
    end
  end
end
32
+
33
+ end
@@ -0,0 +1,52 @@
1
+ require "nokogiri"
2
+
3
module Omnivore
  # Helpers for querying HTML markup with XPath and reducing it to plain text.
  module HtmlHelper

    # Walks a parsed HTML tree and collects the text it contains.
    class HtmlTransformer

      # @param html [String] the markup to convert
      def initialize(html)
        @html = html
      end

      # Parses the markup with Nokogiri and returns its text content.
      # Text inside <style> and <script> elements is left out; the collected
      # pieces are joined with single spaces, stripped, and every run of
      # whitespace is collapsed to one space.
      #
      # @return [String] the whitespace-normalized text
      def to_text
        document = Nokogiri::HTML.parse(@html)
        partition(document, 'style', 'script').values.join(' ').strip.gsub(/\s+/, ' ')
      end

      # Recursively collects the text nodes found under +node+.
      #
      # nil nodes, CDATA nodes, comment nodes, and elements named in
      # +ignore_tags+ (together with everything below them) contribute
      # nothing.
      #
      # @param node [Nokogiri::XML::Node, nil] root of the subtree to walk
      # @param ignore_tags [Array<String>] element names to skip; compared
      #   against the whole element name, ignoring case
      # @return [Hash{String => String}] node path (node.path.to_s) => text
      def partition(node, *ignore_tags)
        return {} if node.nil?
        return {} if node.respond_to?(:cdata?) && node.cdata?
        return {} if node.respond_to?(:comment?) && node.comment?
        return {} if ignored_element?(node, ignore_tags)

        # A text node is a leaf: report it under its own path.
        return { node.path.to_s => node.text } if node.kind_of?(Nokogiri::XML::Text)

        elements = {}
        node.children.each do |child|
          elements.merge!(partition(child, *ignore_tags))
        end
        elements
      end

      private

      # True when +node+ is an element whose name equals one of +ignore_tags+.
      # The comparison is on the full name, so ignoring 'a' does not also
      # ignore 'span', and tag names are never interpreted as regex syntax.
      def ignored_element?(node, ignore_tags)
        return false unless node.kind_of?(Nokogiri::XML::Element)

        ignore_tags.any? { |tag| tag.to_s.casecmp(node.name).zero? }
      end

    end

    # Converts markup to whitespace-normalized plain text.
    #
    # @param html [String] the markup to convert
    # @return [String] see HtmlTransformer#to_text
    def self.to_text(html)
      HtmlTransformer.new(html).to_text
    end

    # Evaluates an XPath expression against the parsed markup.
    #
    # @param html [String] the markup to search
    # @param xpath [String] the XPath expression
    # @return [Array<String>] the HTML serialization of each match
    def self.xpath(html, xpath)
      document = Nokogiri::HTML.parse(html)
      document.xpath(xpath).map(&:to_html)
    end

  end
end
@@ -5,7 +5,7 @@ module Omnivore
5
5
  class HttpClient
6
6
 
7
7
 
8
- def HttpClient.get(url, attempts=3)
8
+ def self.get(url, attempts=3)
9
9
  raise ArgumentError, 'HTTP redirect too deep' if attempts == 0
10
10
 
11
11
  response = Net::HTTP.get_response(URI.parse(url))
@@ -1,3 +1,3 @@
1
1
  module Omnivore
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
data/lib/omnivore.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  require "omnivore/version"
2
- require "omnivore/http_client"
3
- require "omnivore/xpath_extractor"
2
+ require "omnivore/document"
3
+ #require "omnivore/http_client"
4
+ #require "omnivore/xpath_extractor"
4
5
 
5
6
  module Omnivore
6
7
  # Your code goes here...
data/omnivore.gemspec CHANGED
@@ -19,6 +19,6 @@ Gem::Specification.new do |s|
19
19
  s.require_paths = ["lib"]
20
20
 
21
21
  # specify any dependencies here; for example:
22
- s.add_development_dependency "rspec"
23
- # s.add_runtime_dependency "rest-client"
22
+ s.add_development_dependency "rspec", "~> 2.8.0"
23
+ s.add_runtime_dependency "nokogiri", "~> 1.5.0"
24
24
  end
@@ -0,0 +1,30 @@
1
+ require 'omnivore/document'
2
+ include Omnivore
3
+
4
describe Document do

  # Minimal page with a known title. Provided through `let` instead of a
  # constant so this spec file does not define a top-level constant.
  let(:static_html) do
    %{
      <html>
        <head>
          <title>Nothing To See</title>
        </head>
        <body>
          <p>Nothing to see here, move along.</p>
        </body>
      </html>
    }
  end

  # NOTE: this example performs a real HTTP request and needs network access.
  it "should fetch the content of the provided url" do
    document = Document.from_url("http://www.google.com")
    document.html.should_not be_nil
    document.html.should_not be_empty
  end

  it "should provide the document title" do
    document = Document.from_html(static_html)
    document.title.should_not be_nil
    document.title.should_not be_empty
    document.title.should == "Nothing To See"
  end

end
@@ -0,0 +1,42 @@
1
+ require "omnivore/html_helper"
2
+ include Omnivore
3
+
4
+
5
+
6
describe HtmlHelper do

  # Checks both how many nodes matched and which one, so the example cannot
  # pass when the expression selects the wrong <div>.
  it "should match the correct xpath" do
    content = %{
      <html>
        <head></head>
        <body>
          <div class="banner">
            I don't want to see this.
          </div>
          <div class="content">
            This is what I want to see.
          </div>
        </body>
      </html>
    }
    matches = HtmlHelper.xpath(content, "//div[@class=\"content\"]")
    matches.size.should == 1
    matches.first.should include("This is what I want to see.")
    matches.first.should_not include("I don't want to see this.")
  end


  # The fixture contains an unclosed <li> and a <ul> nested in a <p>, so this
  # also exercises text extraction from imperfect markup.
  it "should be able to extract text from markup" do
    html = %{
      <p>
        Content may contain some additional markup, such as:
        <ul>
          <li>Ordered or unordered lists,</li>
          <li><a href="#">Hyperlinks,</a></li>
          <li>and Images
        </ul>
      </p>
    }
    text = HtmlHelper.to_text(html)
    text.should == "Content may contain some additional markup, such as: Ordered or unordered lists, Hyperlinks, and Images"
  end

end
@@ -3,7 +3,7 @@ require 'omnivore/http_client'
3
3
  describe Omnivore::HttpClient do
4
4
 
5
5
  it "should fetch the content of a url" do
6
- html = Omnivore::HttpClient.get("http://blog.steveklabnik.com/posts/2011-09-28-real-modern-ruby-development")
6
+ html = Omnivore::HttpClient.get("http://linksmart.com")
7
7
  html.should_not be_nil
8
8
  html.should_not be_empty
9
9
  end
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: omnivore
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.0.1
5
+ version: 0.0.2
6
6
  platform: ruby
7
7
  authors:
8
8
  - Matthias Eder
@@ -19,11 +19,22 @@ dependencies:
19
19
  requirement: &id001 !ruby/object:Gem::Requirement
20
20
  none: false
21
21
  requirements:
22
- - - ">="
22
+ - - ~>
23
23
  - !ruby/object:Gem::Version
24
- version: "0"
24
+ version: 2.8.0
25
25
  type: :development
26
26
  version_requirements: *id001
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ prerelease: false
30
+ requirement: &id002 !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - ~>
34
+ - !ruby/object:Gem::Version
35
+ version: 1.5.0
36
+ type: :runtime
37
+ version_requirements: *id002
27
38
  description: A library for extracting content from HTML documents.
28
39
  email:
29
40
  - matthias@izume.com
@@ -40,12 +51,14 @@ files:
40
51
  - README.md
41
52
  - Rakefile
42
53
  - lib/omnivore.rb
54
+ - lib/omnivore/document.rb
55
+ - lib/omnivore/html_helper.rb
43
56
  - lib/omnivore/http_client.rb
44
57
  - lib/omnivore/version.rb
45
- - lib/omnivore/xpath_extractor.rb
46
58
  - omnivore.gemspec
59
+ - spec/document_spec.rb
60
+ - spec/html_helper_spec.rb
47
61
  - spec/http_client_spec.rb
48
- - spec/xpath_extractor_spec.rb
49
62
  has_rdoc: true
50
63
  homepage: ""
51
64
  licenses: []
@@ -1,12 +0,0 @@
1
- require "rexml/document"
2
-
3
- module Omnivore
4
- class XPathExtractor
5
-
6
- def XPathExtractor.match(html, xpath)
7
- xmldoc = REXML::Document.new(html)
8
- REXML::XPath.match(xmldoc, xpath)
9
- end
10
-
11
- end
12
- end
@@ -1,30 +0,0 @@
1
- require "omnivore/xpath_extractor"
2
-
3
- CONTENT = %{
4
- <html>
5
- <head></head>
6
- <body>
7
- <div class="banner">
8
- This is a banner
9
- </div>
10
- <div class="topnav">
11
- <ul>
12
- <li>Home</li>
13
- <li>About</li>
14
- </ul>
15
- </div>
16
- <div class="content">
17
- This is where the real stuff is.
18
- </div>
19
- </body>
20
- </html>
21
- }
22
-
23
- describe Omnivore::XPathExtractor do
24
-
25
- it "should match the correct xpath" do
26
- matches = Omnivore::XPathExtractor.match(CONTENT, "//div[@class=\"content\"]")
27
- matches.size.should be > 0
28
- end
29
-
30
- end