omnivore 0.0.4 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -1,4 +1,7 @@
1
+ .rvmrc
2
+ .yardoc
1
3
  *.gem
2
4
  .bundle
3
5
  Gemfile.lock
4
6
  pkg/*
7
+ doc/*
data/README.md CHANGED
@@ -1 +1,18 @@
1
- Nothing to see here, move along.
1
+ ## Omnivore: a library for decrufting HTML documents
2
+
3
+ Omnivore is a library for extracting "real" content from HTML documents. Currently, the approach is limited to
4
+ analysing text density to distinguish relevant sections from navigation, advertising, and other non-relevant elements. As
5
+ such, the results are far from perfect but will hopefully improve as more sophisticated features are added.
6
+
7
+ ### INSTALL
8
+ ```
9
+ sudo gem install omnivore
10
+ ```
11
+
12
+ ### EXAMPLE
13
+ ```ruby
14
+ require 'omnivore'
15
+ document = Omnivore::Document.from_url('http://www.slashgear.com/sennheiser-hd-700-hands-on-10208572')
16
+ puts document.to_text
17
+ ```
18
+
@@ -3,17 +3,29 @@ require "omnivore/http_client"
3
3
 
4
4
  module Omnivore
5
5
 
6
+ # A class encapsulating an HTML document.
6
7
  class Document
7
8
  attr_reader :model
8
- BLOCK_TAGS = %w[div p frame bod]
9
+
10
+ # The HTML tags signaling the start of a block or paragraph.
11
+ BLOCK_TAGS = %w[div p frame]
12
+
13
+ # A Struct describing a paragraph, including its :path in the document, :text,
14
+ # and various metrics, such as :text_density.
9
15
  Paragraph = Struct.new("Paragraph", :path, :text, :text_density)
10
16
 
11
17
 
18
+ # Creates an Omnivore::Document object from a URL.
19
+ # @param [String] url the document's url
20
+ # @return [Document] A new Document object.
12
21
  def self.from_url(url)
13
22
  Document.new(HttpClient.get(url))
14
23
  end
15
24
 
16
25
 
26
+ # Creates an Omnivore::Document object from a string containing HTML.
27
+ # @param [String] html the HTML content
28
+ # @return [Document] A new Document object.
17
29
  def self.from_html(html)
18
30
  Document.new(html)
19
31
  end
@@ -26,16 +38,22 @@ module Omnivore
26
38
  end
27
39
 
28
40
 
41
+ # An HTML representation of the document.
42
+ # @return [String] An HTML representation of the document.
29
43
  def to_html
30
44
  self.model.to_html
31
45
  end
32
46
 
33
47
 
48
+ # Extracts the document title.
49
+ # @return [String] The document title.
34
50
  def title
35
51
  @title ||= self.model.xpath("/html/head/title").text.gsub(/\s+/, " ").strip
36
52
  end
37
53
 
38
54
 
55
+ # Extracts document metadata.
56
+ # @return [Hash] The metadata tags found in the document.
39
57
  def metadata
40
58
  @metadata ||= self.model.xpath("//meta").inject({ }) { |memo, el|
41
59
  memo[el.attr("name")] = el.attr("content") || "" if el.attr("name")
@@ -44,12 +62,19 @@ module Omnivore
44
62
  end
45
63
 
46
64
 
65
+ # Returns the actual content of the document, without navigation, advertising, etc.
66
+ # @return [String] The document's main content.
47
67
  def to_text
48
- paragraphs = self.to_paragraphs.keep_if { |p| p.text_density > 0.5 }
49
- paragraphs.map { |p| p.text }.join("\n")
68
+ self.to_paragraphs.inject([ ]) { |buffer, p|
69
+ buffer << p.text if p.text_density >= 0.25
70
+ buffer
71
+ }.join("\n")
50
72
  end
51
73
 
52
74
 
75
+ # Splits the document into paragraphs, assuming that each <div> or <p> tag represents
76
+ # a paragraph.
77
+ # @return [Array] An array of Paragraph objects.
53
78
  def to_paragraphs
54
79
  self.model.xpath("//div|//p").map { |block|
55
80
  html = block.to_html.gsub(/\s+/, " ").strip
@@ -64,15 +89,20 @@ module Omnivore
64
89
 
65
90
  private
66
91
 
67
- def flatten(block)
92
+ # A convenience method that recursively iterates over a document node and returns
93
+ # an array of all of its children, with the exception of other block elements
94
+ # (e.g., div or p nodes).
95
+ # @param [Nokogiri::XML::Node] node the root node
96
+ # @return [Array] The Nokogiri::XML::Node objects contained in the root.
97
+ def flatten(node)
68
98
  elements = [ ]
69
- return elements if block.nil?
70
- return elements if block.respond_to?('cdata?') and block.cdata?
71
- return elements if block.respond_to?('comment?') and block.comment?
72
- if block.children.empty?
73
- elements << block
99
+ return elements if node.nil?
100
+ return elements if node.respond_to?('cdata?') and node.cdata?
101
+ return elements if node.respond_to?('comment?') and node.comment?
102
+ if node.children.empty?
103
+ elements << node
74
104
  else
75
- block.children.each { |child|
105
+ node.children.each { |child|
76
106
  unless BLOCK_TAGS.include?(child.name)
77
107
  elements += flatten(child)
78
108
  end
@@ -2,16 +2,22 @@ require 'net/http'
2
2
  require 'uri'
3
3
 
4
4
  module Omnivore
5
+ # A simple HTTP client with a redirect feature.
6
+ #
5
7
  class HttpClient
6
8
 
7
-
8
- def self.get(url, attempts=3)
9
- raise ArgumentError, 'HTTP redirect too deep' if attempts == 0
10
-
9
+ # Sends a `GET` request to the specified url, following the provided number of
10
+ # maximum redirects.
11
+ #
12
+ # @param [String] url the url to be requested
13
+ # @param [Integer] redirects the number of redirects to follow
14
+ # @return [String] the response body of the request.
15
+ def self.get(url, redirects=3)
16
+ raise ArgumentError, 'HTTP redirect too deep' if redirects == 0
11
17
  response = Net::HTTP.get_response(URI.parse(url))
12
18
  case response
13
19
  when Net::HTTPSuccess then response.body
14
- when Net::HTTPRedirection then HttpClient.get(response['location'], attempts - 1)
20
+ when Net::HTTPRedirection then HttpClient.get(response['location'], redirects - 1)
15
21
  else
16
22
  response.error!
17
23
  end
@@ -1,3 +1,3 @@
1
1
  module Omnivore
2
- VERSION = "0.0.4"
2
+ VERSION = "0.1.0"
3
3
  end
data/omnivore.gemspec CHANGED
@@ -5,10 +5,11 @@ require "omnivore/version"
5
5
  Gem::Specification.new do |s|
6
6
  s.name = "omnivore"
7
7
  s.version = Omnivore::VERSION
8
+ s.platform = Gem::Platform::RUBY
8
9
  s.authors = ["Matthias Eder"]
9
10
  s.email = ["matthias@izume.com"]
10
- s.homepage = ""
11
- s.summary = %q{Content extraction and analysis}
11
+ s.homepage = "http://github.com/matthiase/omnivore"
12
+ s.summary = %q{Content Extraction and Analysis Library}
12
13
  s.description = %q{A library for extracting content from HTML documents.}
13
14
 
14
15
  s.rubyforge_project = "omnivore"
@@ -19,6 +20,8 @@ Gem::Specification.new do |s|
19
20
  s.require_paths = ["lib"]
20
21
 
21
22
  # specify any dependencies here; for example:
23
+ s.add_development_dependency "yard", "~> 0.7.4"
24
+ s.add_development_dependency "redcarpet", "~> 2.0.1"
22
25
  s.add_development_dependency "rspec", "~> 2.8.0"
23
26
  s.add_runtime_dependency "nokogiri", "~> 1.5.0"
24
27
  end
metadata CHANGED
@@ -1,8 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: omnivore
3
3
  version: !ruby/object:Gem::Version
4
- prerelease:
5
- version: 0.0.4
4
+ hash: 27
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
6
11
  platform: ruby
7
12
  authors:
8
13
  - Matthias Eder
@@ -14,27 +19,69 @@ date: 2012-01-11 00:00:00 -07:00
14
19
  default_executable:
15
20
  dependencies:
16
21
  - !ruby/object:Gem::Dependency
17
- name: rspec
22
+ name: yard
18
23
  prerelease: false
19
24
  requirement: &id001 !ruby/object:Gem::Requirement
20
25
  none: false
21
26
  requirements:
22
27
  - - ~>
23
28
  - !ruby/object:Gem::Version
24
- version: 2.8.0
29
+ hash: 11
30
+ segments:
31
+ - 0
32
+ - 7
33
+ - 4
34
+ version: 0.7.4
25
35
  type: :development
26
36
  version_requirements: *id001
27
37
  - !ruby/object:Gem::Dependency
28
- name: nokogiri
38
+ name: redcarpet
29
39
  prerelease: false
30
40
  requirement: &id002 !ruby/object:Gem::Requirement
31
41
  none: false
32
42
  requirements:
33
43
  - - ~>
34
44
  - !ruby/object:Gem::Version
45
+ hash: 13
46
+ segments:
47
+ - 2
48
+ - 0
49
+ - 1
50
+ version: 2.0.1
51
+ type: :development
52
+ version_requirements: *id002
53
+ - !ruby/object:Gem::Dependency
54
+ name: rspec
55
+ prerelease: false
56
+ requirement: &id003 !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ hash: 47
62
+ segments:
63
+ - 2
64
+ - 8
65
+ - 0
66
+ version: 2.8.0
67
+ type: :development
68
+ version_requirements: *id003
69
+ - !ruby/object:Gem::Dependency
70
+ name: nokogiri
71
+ prerelease: false
72
+ requirement: &id004 !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ~>
76
+ - !ruby/object:Gem::Version
77
+ hash: 3
78
+ segments:
79
+ - 1
80
+ - 5
81
+ - 0
35
82
  version: 1.5.0
36
83
  type: :runtime
37
- version_requirements: *id002
84
+ version_requirements: *id004
38
85
  description: A library for extracting content from HTML documents.
39
86
  email:
40
87
  - matthias@izume.com
@@ -46,7 +93,6 @@ extra_rdoc_files: []
46
93
 
47
94
  files:
48
95
  - .gitignore
49
- - .rvmrc
50
96
  - Gemfile
51
97
  - README.md
52
98
  - Rakefile
@@ -59,7 +105,7 @@ files:
59
105
  - spec/fixtures/thia-breen-interview
60
106
  - spec/http_client_spec.rb
61
107
  has_rdoc: true
62
- homepage: ""
108
+ homepage: http://github.com/matthiase/omnivore
63
109
  licenses: []
64
110
 
65
111
  post_install_message:
@@ -72,19 +118,25 @@ required_ruby_version: !ruby/object:Gem::Requirement
72
118
  requirements:
73
119
  - - ">="
74
120
  - !ruby/object:Gem::Version
121
+ hash: 3
122
+ segments:
123
+ - 0
75
124
  version: "0"
76
125
  required_rubygems_version: !ruby/object:Gem::Requirement
77
126
  none: false
78
127
  requirements:
79
128
  - - ">="
80
129
  - !ruby/object:Gem::Version
130
+ hash: 3
131
+ segments:
132
+ - 0
81
133
  version: "0"
82
134
  requirements: []
83
135
 
84
136
  rubyforge_project: omnivore
85
- rubygems_version: 1.5.0
137
+ rubygems_version: 1.3.7
86
138
  signing_key:
87
139
  specification_version: 3
88
- summary: Content extraction and analysis
140
+ summary: Content Extraction and Analysis Library
89
141
  test_files: []
90
142
 
data/.rvmrc DELETED
@@ -1,7 +0,0 @@
1
-
2
- if [[ -d "${rvm_path:-$HOME/.rvm}/environments" \
3
- && -s "${rvm_path:-$HOME/.rvm}/environments/ruby-1.9.2-p0@omnivore" ]] ; then
4
- \. "${rvm_path:-$HOME/.rvm}/environments/ruby-1.9.2-p0@omnivore"
5
- else
6
- rvm --create "ruby-1.9.2-p0@omnivore"
7
- fi