omnivore 0.0.4 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +3 -0
- data/README.md +18 -1
- data/lib/omnivore/document.rb +40 -10
- data/lib/omnivore/http_client.rb +11 -5
- data/lib/omnivore/version.rb +1 -1
- data/omnivore.gemspec +5 -2
- metadata +62 -10
- data/.rvmrc +0 -7
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -1 +1,18 @@
|
|
1
|
-
|
1
|
+
## Omnivore: a library for decrufting HTML documents
|
2
|
+
|
3
|
+
Omnivore is a library for extracting "real" content from HTML documents. Currently, the approach is limited to
|
4
|
+
analysing text density to distinguish relevant sections from navigation, advertising, and other non-relevant elements. As
|
5
|
+
such, the results are far from perfect but will hopefully improve as more sophisticated features are added.
|
6
|
+
|
7
|
+
### INSTALL
|
8
|
+
```
|
9
|
+
sudo gem install omnivore
|
10
|
+
```
|
11
|
+
|
12
|
+
### EXAMPLE
|
13
|
+
```ruby
|
14
|
+
require 'omnivore'
|
15
|
+
document = Omnivore::Document.from_url('http://www.slashgear.com/sennheiser-hd-700-hands-on-10208572')
|
16
|
+
puts document.to_text
|
17
|
+
```
|
18
|
+
|
data/lib/omnivore/document.rb
CHANGED
@@ -3,17 +3,29 @@ require "omnivore/http_client"
|
|
3
3
|
|
4
4
|
module Omnivore
|
5
5
|
|
6
|
+
# A class encapsulating an HTML document.
|
6
7
|
class Document
|
7
8
|
attr_reader :model
|
8
|
-
|
9
|
+
|
10
|
+
# The HTML tags signaling the start of a block or paragraph.
|
11
|
+
BLOCK_TAGS = %w[div p frame]
|
12
|
+
|
13
|
+
# A Struct describing a paragraph, including its :path in the document, :text,
|
14
|
+
# and various metrics, such as :text_density.
|
9
15
|
Paragraph = Struct.new("Paragraph", :path, :text, :text_density)
|
10
16
|
|
11
17
|
|
18
|
+
# Creates an Omnivore::Document object from a url.
|
19
|
+
# @param [String] url the document's url
|
20
|
+
# @return [Document] A new Document object.
|
12
21
|
def self.from_url(url)
|
13
22
|
Document.new(HttpClient.get(url))
|
14
23
|
end
|
15
24
|
|
16
25
|
|
26
|
+
# Creates an Omnivore::Document object from a string containing HTML.
|
27
|
+
# @param [String] html the HTML content
|
28
|
+
# @return [Document] A new Document object.
|
17
29
|
def self.from_html(html)
|
18
30
|
Document.new(html)
|
19
31
|
end
|
@@ -26,16 +38,22 @@ module Omnivore
|
|
26
38
|
end
|
27
39
|
|
28
40
|
|
41
|
+
# An HTML representation of the document.
|
42
|
+
# @return [String] An HTML representation of the document.
|
29
43
|
def to_html
|
30
44
|
self.model.to_html
|
31
45
|
end
|
32
46
|
|
33
47
|
|
48
|
+
# Extracts the document title.
|
49
|
+
# @return [String] The document title.
|
34
50
|
def title
|
35
51
|
@title ||= self.model.xpath("/html/head/title").text.gsub(/\s+/, " ").strip
|
36
52
|
end
|
37
53
|
|
38
54
|
|
55
|
+
# Extracts document metadata.
|
56
|
+
# @return [Hash] The metadata tags found in the document.
|
39
57
|
def metadata
|
40
58
|
@metadata ||= self.model.xpath("//meta").inject({ }) { |memo, el|
|
41
59
|
memo[el.attr("name")] = el.attr("content") || "" if el.attr("name")
|
@@ -44,12 +62,19 @@ module Omnivore
|
|
44
62
|
end
|
45
63
|
|
46
64
|
|
65
|
+
# Returns the actual content of the document, without navigation, advertising, etc.
|
66
|
+
# @return [String] The document's main content.
|
47
67
|
def to_text
|
48
|
-
|
49
|
-
|
68
|
+
self.to_paragraphs.inject([ ]) { |buffer, p|
|
69
|
+
buffer << p.text if p.text_density >= 0.25
|
70
|
+
buffer
|
71
|
+
}.join("\n")
|
50
72
|
end
|
51
73
|
|
52
74
|
|
75
|
+
# Splits the document into paragraphs, assuming that each <div> or <p> tag represents
|
76
|
+
# a paragraph.
|
77
|
+
# @return [Array] An array of Paragraph objects.
|
53
78
|
def to_paragraphs
|
54
79
|
self.model.xpath("//div|//p").map { |block|
|
55
80
|
html = block.to_html.gsub(/\s+/, " ").strip
|
@@ -64,15 +89,20 @@ module Omnivore
|
|
64
89
|
|
65
90
|
private
|
66
91
|
|
67
|
-
|
92
|
+
# A convenience method that recursively iterates over a document node and returns
|
93
|
+
# an array of all of its children, with the exception of other block elements
|
94
|
+
# (e.g. div or p nodes).
|
95
|
+
# @param [Nokogiri::XML::Node] node the root node
|
96
|
+
# @return [Array] The Nokogiri::XML::Node objects contained in the root.
|
97
|
+
def flatten(node)
|
68
98
|
elements = [ ]
|
69
|
-
return elements if
|
70
|
-
return elements if
|
71
|
-
return elements if
|
72
|
-
if
|
73
|
-
elements <<
|
99
|
+
return elements if node.nil?
|
100
|
+
return elements if node.respond_to?('cdata?') and node.cdata?
|
101
|
+
return elements if node.respond_to?('comment?') and node.comment?
|
102
|
+
if node.children.empty?
|
103
|
+
elements << node
|
74
104
|
else
|
75
|
-
|
105
|
+
node.children.each { |child|
|
76
106
|
unless BLOCK_TAGS.include?(child.name)
|
77
107
|
elements += flatten(child)
|
78
108
|
end
|
data/lib/omnivore/http_client.rb
CHANGED
@@ -2,16 +2,22 @@ require 'net/http'
|
|
2
2
|
require 'uri'
|
3
3
|
|
4
4
|
module Omnivore
|
5
|
+
# A simple HTTP client with a redirect feature.
|
6
|
+
#
|
5
7
|
class HttpClient
|
6
8
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
9
|
+
# Sends a `GET` request to the specified url, following the provided number of
|
10
|
+
# maximum redirects.
|
11
|
+
#
|
12
|
+
# @param [String] url the url to be requested
|
13
|
+
# @param [Integer] redirects the number of redirects to follow
|
14
|
+
# @return [String] the response body of the request.
|
15
|
+
def self.get(url, redirects=3)
|
16
|
+
raise ArgumentError, 'HTTP redirect too deep' if redirects == 0
|
11
17
|
response = Net::HTTP.get_response(URI.parse(url))
|
12
18
|
case response
|
13
19
|
when Net::HTTPSuccess then response.body
|
14
|
-
when Net::HTTPRedirection then HttpClient.get(response['location'],
|
20
|
+
when Net::HTTPRedirection then HttpClient.get(response['location'], redirects - 1)
|
15
21
|
else
|
16
22
|
response.error!
|
17
23
|
end
|
data/lib/omnivore/version.rb
CHANGED
data/omnivore.gemspec
CHANGED
@@ -5,10 +5,11 @@ require "omnivore/version"
|
|
5
5
|
Gem::Specification.new do |s|
|
6
6
|
s.name = "omnivore"
|
7
7
|
s.version = Omnivore::VERSION
|
8
|
+
s.platform = Gem::Platform::RUBY
|
8
9
|
s.authors = ["Matthias Eder"]
|
9
10
|
s.email = ["matthias@izume.com"]
|
10
|
-
s.homepage = ""
|
11
|
-
s.summary = %q{Content
|
11
|
+
s.homepage = "http://github.com/matthiase/omnivore"
|
12
|
+
s.summary = %q{Content Extraction and Analysis Library}
|
12
13
|
s.description = %q{A library for extracting content from HTML documents.}
|
13
14
|
|
14
15
|
s.rubyforge_project = "omnivore"
|
@@ -19,6 +20,8 @@ Gem::Specification.new do |s|
|
|
19
20
|
s.require_paths = ["lib"]
|
20
21
|
|
21
22
|
# specify any dependencies here; for example:
|
23
|
+
s.add_development_dependency "yard", "~> 0.7.4"
|
24
|
+
s.add_development_dependency "redcarpet", "~> 2.0.1"
|
22
25
|
s.add_development_dependency "rspec", "~> 2.8.0"
|
23
26
|
s.add_runtime_dependency "nokogiri", "~> 1.5.0"
|
24
27
|
end
|
metadata
CHANGED
@@ -1,8 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: omnivore
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
|
4
|
+
hash: 27
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 0
|
10
|
+
version: 0.1.0
|
6
11
|
platform: ruby
|
7
12
|
authors:
|
8
13
|
- Matthias Eder
|
@@ -14,27 +19,69 @@ date: 2012-01-11 00:00:00 -07:00
|
|
14
19
|
default_executable:
|
15
20
|
dependencies:
|
16
21
|
- !ruby/object:Gem::Dependency
|
17
|
-
name:
|
22
|
+
name: yard
|
18
23
|
prerelease: false
|
19
24
|
requirement: &id001 !ruby/object:Gem::Requirement
|
20
25
|
none: false
|
21
26
|
requirements:
|
22
27
|
- - ~>
|
23
28
|
- !ruby/object:Gem::Version
|
24
|
-
|
29
|
+
hash: 11
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
- 7
|
33
|
+
- 4
|
34
|
+
version: 0.7.4
|
25
35
|
type: :development
|
26
36
|
version_requirements: *id001
|
27
37
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
38
|
+
name: redcarpet
|
29
39
|
prerelease: false
|
30
40
|
requirement: &id002 !ruby/object:Gem::Requirement
|
31
41
|
none: false
|
32
42
|
requirements:
|
33
43
|
- - ~>
|
34
44
|
- !ruby/object:Gem::Version
|
45
|
+
hash: 13
|
46
|
+
segments:
|
47
|
+
- 2
|
48
|
+
- 0
|
49
|
+
- 1
|
50
|
+
version: 2.0.1
|
51
|
+
type: :development
|
52
|
+
version_requirements: *id002
|
53
|
+
- !ruby/object:Gem::Dependency
|
54
|
+
name: rspec
|
55
|
+
prerelease: false
|
56
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
hash: 47
|
62
|
+
segments:
|
63
|
+
- 2
|
64
|
+
- 8
|
65
|
+
- 0
|
66
|
+
version: 2.8.0
|
67
|
+
type: :development
|
68
|
+
version_requirements: *id003
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: nokogiri
|
71
|
+
prerelease: false
|
72
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ~>
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
hash: 3
|
78
|
+
segments:
|
79
|
+
- 1
|
80
|
+
- 5
|
81
|
+
- 0
|
35
82
|
version: 1.5.0
|
36
83
|
type: :runtime
|
37
|
-
version_requirements: *
|
84
|
+
version_requirements: *id004
|
38
85
|
description: A library for extracting content from HTML documents.
|
39
86
|
email:
|
40
87
|
- matthias@izume.com
|
@@ -46,7 +93,6 @@ extra_rdoc_files: []
|
|
46
93
|
|
47
94
|
files:
|
48
95
|
- .gitignore
|
49
|
-
- .rvmrc
|
50
96
|
- Gemfile
|
51
97
|
- README.md
|
52
98
|
- Rakefile
|
@@ -59,7 +105,7 @@ files:
|
|
59
105
|
- spec/fixtures/thia-breen-interview
|
60
106
|
- spec/http_client_spec.rb
|
61
107
|
has_rdoc: true
|
62
|
-
homepage:
|
108
|
+
homepage: http://github.com/matthiase/omnivore
|
63
109
|
licenses: []
|
64
110
|
|
65
111
|
post_install_message:
|
@@ -72,19 +118,25 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
72
118
|
requirements:
|
73
119
|
- - ">="
|
74
120
|
- !ruby/object:Gem::Version
|
121
|
+
hash: 3
|
122
|
+
segments:
|
123
|
+
- 0
|
75
124
|
version: "0"
|
76
125
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
77
126
|
none: false
|
78
127
|
requirements:
|
79
128
|
- - ">="
|
80
129
|
- !ruby/object:Gem::Version
|
130
|
+
hash: 3
|
131
|
+
segments:
|
132
|
+
- 0
|
81
133
|
version: "0"
|
82
134
|
requirements: []
|
83
135
|
|
84
136
|
rubyforge_project: omnivore
|
85
|
-
rubygems_version: 1.
|
137
|
+
rubygems_version: 1.3.7
|
86
138
|
signing_key:
|
87
139
|
specification_version: 3
|
88
|
-
summary: Content
|
140
|
+
summary: Content Extraction and Analysis Library
|
89
141
|
test_files: []
|
90
142
|
|
data/.rvmrc
DELETED