omnivore 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/omnivore/document.rb +33 -0
- data/lib/omnivore/html_helper.rb +52 -0
- data/lib/omnivore/http_client.rb +1 -1
- data/lib/omnivore/version.rb +1 -1
- data/lib/omnivore.rb +3 -2
- data/omnivore.gemspec +2 -2
- data/spec/document_spec.rb +30 -0
- data/spec/html_helper_spec.rb +42 -0
- data/spec/http_client_spec.rb +1 -1
- metadata +18 -5
- data/lib/omnivore/xpath_extractor.rb +0 -12
- data/spec/xpath_extractor_spec.rb +0 -30
data/lib/omnivore/document.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
module Omnivore
|
2
|
+
require "omnivore/http_client"
|
3
|
+
require "omnivore/html_helper"
|
4
|
+
|
5
|
+
class Document
|
6
|
+
attr_reader :html
|
7
|
+
|
8
|
+
def self.from_url(url)
|
9
|
+
Document.new(HttpClient.get(url))
|
10
|
+
end
|
11
|
+
|
12
|
+
def self.from_html(html)
|
13
|
+
Document.new(html)
|
14
|
+
end
|
15
|
+
|
16
|
+
|
17
|
+
def initialize(html)
|
18
|
+
@html = html
|
19
|
+
end
|
20
|
+
|
21
|
+
|
22
|
+
def title
|
23
|
+
unless @title
|
24
|
+
matches = HtmlHelper.xpath(self.html, "/html/head/title")
|
25
|
+
@title = HtmlHelper.to_text(matches.first) || ""
|
26
|
+
end
|
27
|
+
@title
|
28
|
+
end
|
29
|
+
|
30
|
+
|
31
|
+
end
|
32
|
+
|
33
|
+
end
|
data/lib/omnivore/html_helper.rb
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
|
3
|
+
module Omnivore
|
4
|
+
module HtmlHelper
|
5
|
+
|
6
|
+
class HtmlTransformer
|
7
|
+
|
8
|
+
def initialize(html)
|
9
|
+
@html = html
|
10
|
+
end
|
11
|
+
|
12
|
+
def to_text
|
13
|
+
document = Nokogiri::HTML.parse(@html)
|
14
|
+
partition(document, 'style', 'script').values.join(' ').strip.gsub(/\s+/, ' ')
|
15
|
+
end
|
16
|
+
|
17
|
+
def partition(node, *ignore_tags)
|
18
|
+
elements = { }
|
19
|
+
return elements if node.nil?
|
20
|
+
return elements if node.respond_to?('cdata?') and node.cdata?
|
21
|
+
return elements if node.respond_to?('comment?') and node.comment?
|
22
|
+
|
23
|
+
if node.kind_of?(Nokogiri::XML::Element) and ignore_tags and ignore_tags.size > 0
|
24
|
+
return elements if node.name =~ %r[#{ignore_tags.join('|')}]i
|
25
|
+
end
|
26
|
+
|
27
|
+
elements = { }
|
28
|
+
if node.kind_of?(Nokogiri::XML::Text)
|
29
|
+
elements[node.path.to_s] = node.text
|
30
|
+
return elements
|
31
|
+
end
|
32
|
+
node.children.each do |child|
|
33
|
+
elements.merge!(partition(child, *ignore_tags))
|
34
|
+
end
|
35
|
+
elements
|
36
|
+
end
|
37
|
+
|
38
|
+
end
|
39
|
+
|
40
|
+
def HtmlHelper.to_text(html)
|
41
|
+
transformer = HtmlTransformer.new(html)
|
42
|
+
transformer.to_text
|
43
|
+
end
|
44
|
+
|
45
|
+
|
46
|
+
def HtmlHelper.xpath(html, xpath)
|
47
|
+
document = Nokogiri::HTML.parse(html)
|
48
|
+
document.xpath(xpath).map { |m| m.to_html }
|
49
|
+
end
|
50
|
+
|
51
|
+
end
|
52
|
+
end
|
data/lib/omnivore/http_client.rb
CHANGED
data/lib/omnivore/version.rb
CHANGED
data/lib/omnivore.rb
CHANGED
data/omnivore.gemspec
CHANGED
@@ -19,6 +19,6 @@ Gem::Specification.new do |s|
|
|
19
19
|
s.require_paths = ["lib"]
|
20
20
|
|
21
21
|
# specify any dependencies here; for example:
|
22
|
-
s.add_development_dependency "rspec"
|
23
|
-
|
22
|
+
s.add_development_dependency "rspec", "~> 2.8.0"
|
23
|
+
s.add_runtime_dependency "nokogiri", "~> 1.5.0"
|
24
24
|
end
|
data/spec/document_spec.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'omnivore/document'
|
2
|
+
include Omnivore
|
3
|
+
|
4
|
+
describe Document do
|
5
|
+
|
6
|
+
STATIC_HTML = %{
|
7
|
+
<html>
|
8
|
+
<head>
|
9
|
+
<title>Nothing To See</title>
|
10
|
+
</head>
|
11
|
+
<body>
|
12
|
+
<p>Nothing to see here, move along.</p>
|
13
|
+
</body>
|
14
|
+
</html>
|
15
|
+
}
|
16
|
+
|
17
|
+
it "should fetch the content of the provided url" do
|
18
|
+
document = Document.from_url("http://www.google.com")
|
19
|
+
document.html.should_not be_nil
|
20
|
+
document.html.should_not be_empty
|
21
|
+
end
|
22
|
+
|
23
|
+
it "should provide the document title" do
|
24
|
+
document = Document.from_html(STATIC_HTML)
|
25
|
+
document.title.should_not be_nil
|
26
|
+
document.title.should_not be_empty
|
27
|
+
document.title.should == "Nothing To See"
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|
data/spec/html_helper_spec.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
require "omnivore/html_helper"
|
2
|
+
include Omnivore
|
3
|
+
|
4
|
+
|
5
|
+
|
6
|
+
describe HtmlHelper do
|
7
|
+
|
8
|
+
it "should match the correct xpath" do
|
9
|
+
content = %{
|
10
|
+
<html>
|
11
|
+
<head></head>
|
12
|
+
<body>
|
13
|
+
<div class="banner">
|
14
|
+
I don't want to see this.
|
15
|
+
</div>
|
16
|
+
<div class="content">
|
17
|
+
This is what I want to see.
|
18
|
+
</div>
|
19
|
+
</body>
|
20
|
+
</html>
|
21
|
+
}
|
22
|
+
matches = HtmlHelper.xpath(content, "//div[@class=\"content\"]")
|
23
|
+
matches.size.should == 1
|
24
|
+
end
|
25
|
+
|
26
|
+
|
27
|
+
it "should be able to extract text from markup" do
|
28
|
+
html = %{
|
29
|
+
<p>
|
30
|
+
Content may contain some additional markup, such as:
|
31
|
+
<ul>
|
32
|
+
<li>Ordered or unordered lists,</li>
|
33
|
+
<li><a href="#">Hyperlinks,</a></li>
|
34
|
+
<li>and Images
|
35
|
+
</ul>
|
36
|
+
</p>
|
37
|
+
}
|
38
|
+
text = HtmlHelper.to_text(html)
|
39
|
+
text.should == "Content may contain some additional markup, such as: Ordered or unordered lists, Hyperlinks, and Images"
|
40
|
+
end
|
41
|
+
|
42
|
+
end
|
data/spec/http_client_spec.rb
CHANGED
@@ -3,7 +3,7 @@ require 'omnivore/http_client'
|
|
3
3
|
describe Omnivore::HttpClient do
|
4
4
|
|
5
5
|
it "should fetch the content of a url" do
|
6
|
-
html = Omnivore::HttpClient.get("http://
|
6
|
+
html = Omnivore::HttpClient.get("http://linksmart.com")
|
7
7
|
html.should_not be_nil
|
8
8
|
html.should_not be_empty
|
9
9
|
end
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: omnivore
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.0.1
|
5
|
+
version: 0.0.2
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Matthias Eder
|
@@ -19,11 +19,22 @@ dependencies:
|
|
19
19
|
requirement: &id001 !ruby/object:Gem::Requirement
|
20
20
|
none: false
|
21
21
|
requirements:
|
22
|
-
- - ">="
|
22
|
+
- - ~>
|
23
23
|
- !ruby/object:Gem::Version
|
24
|
-
version: "0"
|
24
|
+
version: 2.8.0
|
25
25
|
type: :development
|
26
26
|
version_requirements: *id001
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: nokogiri
|
29
|
+
prerelease: false
|
30
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
31
|
+
none: false
|
32
|
+
requirements:
|
33
|
+
- - ~>
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: 1.5.0
|
36
|
+
type: :runtime
|
37
|
+
version_requirements: *id002
|
27
38
|
description: A library for extracting content from HTML documents.
|
28
39
|
email:
|
29
40
|
- matthias@izume.com
|
@@ -40,12 +51,14 @@ files:
|
|
40
51
|
- README.md
|
41
52
|
- Rakefile
|
42
53
|
- lib/omnivore.rb
|
54
|
+
- lib/omnivore/document.rb
|
55
|
+
- lib/omnivore/html_helper.rb
|
43
56
|
- lib/omnivore/http_client.rb
|
44
57
|
- lib/omnivore/version.rb
|
45
|
-
- lib/omnivore/xpath_extractor.rb
|
46
58
|
- omnivore.gemspec
|
59
|
+
- spec/document_spec.rb
|
60
|
+
- spec/html_helper_spec.rb
|
47
61
|
- spec/http_client_spec.rb
|
48
|
-
- spec/xpath_extractor_spec.rb
|
49
62
|
has_rdoc: true
|
50
63
|
homepage: ""
|
51
64
|
licenses: []
|
data/spec/xpath_extractor_spec.rb
REMOVED
@@ -1,30 +0,0 @@
|
|
1
|
-
require "omnivore/xpath_extractor"
|
2
|
-
|
3
|
-
CONTENT = %{
|
4
|
-
<html>
|
5
|
-
<head></head>
|
6
|
-
<body>
|
7
|
-
<div class="banner">
|
8
|
-
This is a banner
|
9
|
-
</div>
|
10
|
-
<div class="topnav">
|
11
|
-
<ul>
|
12
|
-
<li>Home</li>
|
13
|
-
<li>About</li>
|
14
|
-
</ul>
|
15
|
-
</div>
|
16
|
-
<div class="content">
|
17
|
-
This is where the real stuff is.
|
18
|
-
</div>
|
19
|
-
</body>
|
20
|
-
</html>
|
21
|
-
}
|
22
|
-
|
23
|
-
describe Omnivore::XPathExtractor do
|
24
|
-
|
25
|
-
it "should match the correct xpath" do
|
26
|
-
matches = Omnivore::XPathExtractor.match(CONTENT, "//div[@class=\"content\"]")
|
27
|
-
matches.size.should be > 0
|
28
|
-
end
|
29
|
-
|
30
|
-
end
|