omnivore 0.0.4 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +3 -0
- data/README.md +18 -1
- data/lib/omnivore/document.rb +40 -10
- data/lib/omnivore/http_client.rb +11 -5
- data/lib/omnivore/version.rb +1 -1
- data/omnivore.gemspec +5 -2
- metadata +62 -10
- data/.rvmrc +0 -7
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -1 +1,18 @@
|
|
1
|
-
|
1
|
+
## Omnivore: a library for decrufting HTML documents
|
2
|
+
|
3
|
+
Omnivore is a library for extracting "real" content from HTML documents. Currently, the approach is limited to
|
4
|
+
analysing text density to distinguish relevant sections from navigation, advertising, and other non-relevant elements. As
|
5
|
+
such, the results are far from perfect but will hopefully improve as more sophisticated features are added.
|
6
|
+
|
7
|
+
### INSTALL
|
8
|
+
```
|
9
|
+
sudo gem install omnivore
|
10
|
+
```
|
11
|
+
|
12
|
+
### EXAMPLE
|
13
|
+
```ruby
|
14
|
+
require 'omnivore'
|
15
|
+
document = Omnivore::Document.from_url('http://www.slashgear.com/sennheiser-hd-700-hands-on-10208572')
|
16
|
+
puts document.to_text
|
17
|
+
```
|
18
|
+
|
data/lib/omnivore/document.rb
CHANGED
@@ -3,17 +3,29 @@ require "omnivore/http_client"
|
|
3
3
|
|
4
4
|
module Omnivore
|
5
5
|
|
6
|
+
# A class encapsulating an HTML document.
|
6
7
|
class Document
|
7
8
|
attr_reader :model
|
8
|
-
|
9
|
+
|
10
|
+
# The HTML tags signaling the start of a block or paragraph.
|
11
|
+
BLOCK_TAGS = %w[div p frame]
|
12
|
+
|
13
|
+
# A Struct describing a paragraph, including its :path in the document, :text,
|
14
|
+
# and various metrics, such as :text_density.
|
9
15
|
Paragraph = Struct.new("Paragraph", :path, :text, :text_density)
|
10
16
|
|
11
17
|
|
18
|
+
# Creates an Omnivore::Document object from a URL.
|
19
|
+
# @param [String] url the document's url
|
20
|
+
# @return [Document] A new Document object.
|
12
21
|
def self.from_url(url)
|
13
22
|
Document.new(HttpClient.get(url))
|
14
23
|
end
|
15
24
|
|
16
25
|
|
26
|
+
# Creates an Omnivore::Document object from a string containing HTML.
|
27
|
+
# @param [String] html the HTML content
|
28
|
+
# @return [Document] A new Document object.
|
17
29
|
def self.from_html(html)
|
18
30
|
Document.new(html)
|
19
31
|
end
|
@@ -26,16 +38,22 @@ module Omnivore
|
|
26
38
|
end
|
27
39
|
|
28
40
|
|
41
|
+
# An HTML representation of the document.
|
42
|
+
# @return [String] An HTML representation of the document.
|
29
43
|
def to_html
|
30
44
|
self.model.to_html
|
31
45
|
end
|
32
46
|
|
33
47
|
|
48
|
+
# Extracts the document title.
|
49
|
+
# @return [String] The document title.
|
34
50
|
def title
|
35
51
|
@title ||= self.model.xpath("/html/head/title").text.gsub(/\s+/, " ").strip
|
36
52
|
end
|
37
53
|
|
38
54
|
|
55
|
+
# Extracts document metadata.
|
56
|
+
# @return [Hash] The metadata tags found in the document.
|
39
57
|
def metadata
|
40
58
|
@metadata ||= self.model.xpath("//meta").inject({ }) { |memo, el|
|
41
59
|
memo[el.attr("name")] = el.attr("content") || "" if el.attr("name")
|
@@ -44,12 +62,19 @@ module Omnivore
|
|
44
62
|
end
|
45
63
|
|
46
64
|
|
65
|
+
# Returns the actual content of the document, without navigation, advertising, etc.
|
66
|
+
# @return [String] The document's main content.
|
47
67
|
def to_text
|
48
|
-
|
49
|
-
|
68
|
+
self.to_paragraphs.inject([ ]) { |buffer, p|
|
69
|
+
buffer << p.text if p.text_density >= 0.25
|
70
|
+
buffer
|
71
|
+
}.join("\n")
|
50
72
|
end
|
51
73
|
|
52
74
|
|
75
|
+
# Splits the document into paragraphs, assuming that each <div> or <p> tag represents
|
76
|
+
# a paragraph.
|
77
|
+
# @return [Array] An array of Paragraph objects.
|
53
78
|
def to_paragraphs
|
54
79
|
self.model.xpath("//div|//p").map { |block|
|
55
80
|
html = block.to_html.gsub(/\s+/, " ").strip
|
@@ -64,15 +89,20 @@ module Omnivore
|
|
64
89
|
|
65
90
|
private
|
66
91
|
|
67
|
-
|
92
|
+
# A convenience method that recursively iterates over a document node and returns
|
93
|
+
# an array of all of its children, with the exception of other block elements
|
94
|
+
# (e.g. div or p nodes).
|
95
|
+
# @param [Nokogiri::XML::Node] node the root node
|
96
|
+
# @return [Array] The Nokogiri::XML::Node objects contained in the root.
|
97
|
+
def flatten(node)
|
68
98
|
elements = [ ]
|
69
|
-
return elements if
|
70
|
-
return elements if
|
71
|
-
return elements if
|
72
|
-
if
|
73
|
-
elements <<
|
99
|
+
return elements if node.nil?
|
100
|
+
return elements if node.respond_to?('cdata?') and node.cdata?
|
101
|
+
return elements if node.respond_to?('comment?') and node.comment?
|
102
|
+
if node.children.empty?
|
103
|
+
elements << node
|
74
104
|
else
|
75
|
-
|
105
|
+
node.children.each { |child|
|
76
106
|
unless BLOCK_TAGS.include?(child.name)
|
77
107
|
elements += flatten(child)
|
78
108
|
end
|
data/lib/omnivore/http_client.rb
CHANGED
@@ -2,16 +2,22 @@ require 'net/http'
|
|
2
2
|
require 'uri'
|
3
3
|
|
4
4
|
module Omnivore
|
5
|
+
# A simple HTTP client with a redirect feature.
|
6
|
+
#
|
5
7
|
class HttpClient
|
6
8
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
9
|
+
# Sends a `GET` request to the specified url, following the provided number of
|
10
|
+
# maximum redirects.
|
11
|
+
#
|
12
|
+
# @param [String] url the url to be requested
|
13
|
+
# @param [Integer] redirects the number of redirects to follow
|
14
|
+
# @return [String] the response body of the request.
|
15
|
+
def self.get(url, redirects=3)
|
16
|
+
raise ArgumentError, 'HTTP redirect too deep' if redirects == 0
|
11
17
|
response = Net::HTTP.get_response(URI.parse(url))
|
12
18
|
case response
|
13
19
|
when Net::HTTPSuccess then response.body
|
14
|
-
when Net::HTTPRedirection then HttpClient.get(response['location'],
|
20
|
+
when Net::HTTPRedirection then HttpClient.get(response['location'], redirects - 1)
|
15
21
|
else
|
16
22
|
response.error!
|
17
23
|
end
|
data/lib/omnivore/version.rb
CHANGED
data/omnivore.gemspec
CHANGED
@@ -5,10 +5,11 @@ require "omnivore/version"
|
|
5
5
|
Gem::Specification.new do |s|
|
6
6
|
s.name = "omnivore"
|
7
7
|
s.version = Omnivore::VERSION
|
8
|
+
s.platform = Gem::Platform::RUBY
|
8
9
|
s.authors = ["Matthias Eder"]
|
9
10
|
s.email = ["matthias@izume.com"]
|
10
|
-
s.homepage = ""
|
11
|
-
s.summary = %q{Content
|
11
|
+
s.homepage = "http://github.com/matthiase/omnivore"
|
12
|
+
s.summary = %q{Content Extraction and Analysis Library}
|
12
13
|
s.description = %q{A library for extracting content from HTML documents.}
|
13
14
|
|
14
15
|
s.rubyforge_project = "omnivore"
|
@@ -19,6 +20,8 @@ Gem::Specification.new do |s|
|
|
19
20
|
s.require_paths = ["lib"]
|
20
21
|
|
21
22
|
# specify any dependencies here; for example:
|
23
|
+
s.add_development_dependency "yard", "~> 0.7.4"
|
24
|
+
s.add_development_dependency "redcarpet", "~> 2.0.1"
|
22
25
|
s.add_development_dependency "rspec", "~> 2.8.0"
|
23
26
|
s.add_runtime_dependency "nokogiri", "~> 1.5.0"
|
24
27
|
end
|
metadata
CHANGED
@@ -1,8 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: omnivore
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
|
4
|
+
hash: 27
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 0
|
10
|
+
version: 0.1.0
|
6
11
|
platform: ruby
|
7
12
|
authors:
|
8
13
|
- Matthias Eder
|
@@ -14,27 +19,69 @@ date: 2012-01-11 00:00:00 -07:00
|
|
14
19
|
default_executable:
|
15
20
|
dependencies:
|
16
21
|
- !ruby/object:Gem::Dependency
|
17
|
-
name:
|
22
|
+
name: yard
|
18
23
|
prerelease: false
|
19
24
|
requirement: &id001 !ruby/object:Gem::Requirement
|
20
25
|
none: false
|
21
26
|
requirements:
|
22
27
|
- - ~>
|
23
28
|
- !ruby/object:Gem::Version
|
24
|
-
|
29
|
+
hash: 11
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
- 7
|
33
|
+
- 4
|
34
|
+
version: 0.7.4
|
25
35
|
type: :development
|
26
36
|
version_requirements: *id001
|
27
37
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
38
|
+
name: redcarpet
|
29
39
|
prerelease: false
|
30
40
|
requirement: &id002 !ruby/object:Gem::Requirement
|
31
41
|
none: false
|
32
42
|
requirements:
|
33
43
|
- - ~>
|
34
44
|
- !ruby/object:Gem::Version
|
45
|
+
hash: 13
|
46
|
+
segments:
|
47
|
+
- 2
|
48
|
+
- 0
|
49
|
+
- 1
|
50
|
+
version: 2.0.1
|
51
|
+
type: :development
|
52
|
+
version_requirements: *id002
|
53
|
+
- !ruby/object:Gem::Dependency
|
54
|
+
name: rspec
|
55
|
+
prerelease: false
|
56
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
hash: 47
|
62
|
+
segments:
|
63
|
+
- 2
|
64
|
+
- 8
|
65
|
+
- 0
|
66
|
+
version: 2.8.0
|
67
|
+
type: :development
|
68
|
+
version_requirements: *id003
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: nokogiri
|
71
|
+
prerelease: false
|
72
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ~>
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
hash: 3
|
78
|
+
segments:
|
79
|
+
- 1
|
80
|
+
- 5
|
81
|
+
- 0
|
35
82
|
version: 1.5.0
|
36
83
|
type: :runtime
|
37
|
-
version_requirements: *
|
84
|
+
version_requirements: *id004
|
38
85
|
description: A library for extracting content from HTML documents.
|
39
86
|
email:
|
40
87
|
- matthias@izume.com
|
@@ -46,7 +93,6 @@ extra_rdoc_files: []
|
|
46
93
|
|
47
94
|
files:
|
48
95
|
- .gitignore
|
49
|
-
- .rvmrc
|
50
96
|
- Gemfile
|
51
97
|
- README.md
|
52
98
|
- Rakefile
|
@@ -59,7 +105,7 @@ files:
|
|
59
105
|
- spec/fixtures/thia-breen-interview
|
60
106
|
- spec/http_client_spec.rb
|
61
107
|
has_rdoc: true
|
62
|
-
homepage:
|
108
|
+
homepage: http://github.com/matthiase/omnivore
|
63
109
|
licenses: []
|
64
110
|
|
65
111
|
post_install_message:
|
@@ -72,19 +118,25 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
72
118
|
requirements:
|
73
119
|
- - ">="
|
74
120
|
- !ruby/object:Gem::Version
|
121
|
+
hash: 3
|
122
|
+
segments:
|
123
|
+
- 0
|
75
124
|
version: "0"
|
76
125
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
77
126
|
none: false
|
78
127
|
requirements:
|
79
128
|
- - ">="
|
80
129
|
- !ruby/object:Gem::Version
|
130
|
+
hash: 3
|
131
|
+
segments:
|
132
|
+
- 0
|
81
133
|
version: "0"
|
82
134
|
requirements: []
|
83
135
|
|
84
136
|
rubyforge_project: omnivore
|
85
|
-
rubygems_version: 1.
|
137
|
+
rubygems_version: 1.3.7
|
86
138
|
signing_key:
|
87
139
|
specification_version: 3
|
88
|
-
summary: Content
|
140
|
+
summary: Content Extraction and Analysis Library
|
89
141
|
test_files: []
|
90
142
|
|
data/.rvmrc
DELETED