omnivore 0.0.4 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -1,4 +1,7 @@
1
+ .rvmrc
2
+ .yardoc
1
3
  *.gem
2
4
  .bundle
3
5
  Gemfile.lock
4
6
  pkg/*
7
+ doc/*
data/README.md CHANGED
@@ -1 +1,18 @@
1
- Nothing to see here, move along.
1
+ ## Omnivore: a library for decrufting HTML documents
2
+
3
+ Omnivore is a library for extracting "real" content from HTML documents. Currently, the approach is limited to
4
+ analysing text density to distinguish relevant sections from navigation, advertising, and other non-relevant elements. As
5
+ such, the results are far from perfect but will hopefully improve as more sophisticated features are added.
6
+
7
+ ### INSTALL
8
+ ```
9
+ sudo gem install omnivore
10
+ ```
11
+
12
+ ### EXAMPLE
13
+ ```ruby
14
+ require 'omnivore'
15
+ document = Omnivore::Document.from_url('http://www.slashgear.com/sennheiser-hd-700-hands-on-10208572')
16
+ puts document.to_text
17
+ ```
18
+
@@ -3,17 +3,29 @@ require "omnivore/http_client"
3
3
 
4
4
  module Omnivore
5
5
 
6
+ # A class encapsulating an HTML document.
6
7
  class Document
7
8
  attr_reader :model
8
- BLOCK_TAGS = %w[div p frame bod]
9
+
10
+ # The HTML tags signaling the start of a block or paragraph.
11
+ BLOCK_TAGS = %w[div p frame]
12
+
13
+ # A Struct describing a paragraph, including its :path in the document, :text,
14
+ # and various metrics, such as :text_density.
9
15
  Paragraph = Struct.new("Paragraph", :path, :text, :text_density)
10
16
 
11
17
 
18
+ # Creates an Omnivore::Document object from a url.
19
+ # @param [String] url the document's url
20
+ # @return [Document] A new Document object.
12
21
  def self.from_url(url)
13
22
  Document.new(HttpClient.get(url))
14
23
  end
15
24
 
16
25
 
26
+ # Creates an Omnivore::Document object from a string containing HTML.
27
+ # @param [String] html the HTML content
28
+ # @return [Document] A new Document object.
17
29
  def self.from_html(html)
18
30
  Document.new(html)
19
31
  end
@@ -26,16 +38,22 @@ module Omnivore
26
38
  end
27
39
 
28
40
 
41
+ # An HTML representation of the document.
42
+ # @return [String] An HTML representation of the document.
29
43
  def to_html
30
44
  self.model.to_html
31
45
  end
32
46
 
33
47
 
48
+ # Extracts the document title.
49
+ # @return [String] The document title.
34
50
  def title
35
51
  @title ||= self.model.xpath("/html/head/title").text.gsub(/\s+/, " ").strip
36
52
  end
37
53
 
38
54
 
55
+ # Extracts document metadata.
56
+ # @return [Hash] The metadata tags found in the document.
39
57
  def metadata
40
58
  @metadata ||= self.model.xpath("//meta").inject({ }) { |memo, el|
41
59
  memo[el.attr("name")] = el.attr("content") || "" if el.attr("name")
@@ -44,12 +62,19 @@ module Omnivore
44
62
  end
45
63
 
46
64
 
65
+ # Returns the actual content of the document, without navigation, advertising, etc.
66
+ # @return [String] The document's main content.
47
67
  def to_text
48
- paragraphs = self.to_paragraphs.keep_if { |p| p.text_density > 0.5 }
49
- paragraphs.map { |p| p.text }.join("\n")
68
+ self.to_paragraphs.inject([ ]) { |buffer, p|
69
+ buffer << p.text if p.text_density >= 0.25
70
+ buffer
71
+ }.join("\n")
50
72
  end
51
73
 
52
74
 
75
+ # Splits the document into paragraphs, assuming that each <div> or <p> tag represents
76
+ # a paragraph.
77
+ # @return [Array] An array of Paragraph objects.
53
78
  def to_paragraphs
54
79
  self.model.xpath("//div|//p").map { |block|
55
80
  html = block.to_html.gsub(/\s+/, " ").strip
@@ -64,15 +89,20 @@ module Omnivore
64
89
 
65
90
  private
66
91
 
67
- def flatten(block)
92
+ # A convenience method that recursively iterates over a document node and returns
93
+ # an array of all of its children, with the exception of other block elements
94
+ # (e.g. div or p nodes).
95
+ # @param [Nokogiri::XML::Node] node the root node
96
+ # @return [Array] The Nokogiri::XML::Node objects contained in the root.
97
+ def flatten(node)
68
98
  elements = [ ]
69
- return elements if block.nil?
70
- return elements if block.respond_to?('cdata?') and block.cdata?
71
- return elements if block.respond_to?('comment?') and block.comment?
72
- if block.children.empty?
73
- elements << block
99
+ return elements if node.nil?
100
+ return elements if node.respond_to?('cdata?') and node.cdata?
101
+ return elements if node.respond_to?('comment?') and node.comment?
102
+ if node.children.empty?
103
+ elements << node
74
104
  else
75
- block.children.each { |child|
105
+ node.children.each { |child|
76
106
  unless BLOCK_TAGS.include?(child.name)
77
107
  elements += flatten(child)
78
108
  end
@@ -2,16 +2,22 @@ require 'net/http'
2
2
  require 'uri'
3
3
 
4
4
  module Omnivore
5
+ # A simple HTTP client with a redirect feature.
6
+ #
5
7
  class HttpClient
6
8
 
7
-
8
- def self.get(url, attempts=3)
9
- raise ArgumentError, 'HTTP redirect too deep' if attempts == 0
10
-
9
+ # Sends a `GET` request to the specified url, following the provided number of
10
+ # maximum redirects.
11
+ #
12
+ # @param [String] url the url to be requested
13
+ # @param [Integer] redirects the number of redirects to follow
14
+ # @return [String] the response body of the request.
15
+ def self.get(url, redirects=3)
16
+ raise ArgumentError, 'HTTP redirect too deep' if redirects == 0
11
17
  response = Net::HTTP.get_response(URI.parse(url))
12
18
  case response
13
19
  when Net::HTTPSuccess then response.body
14
- when Net::HTTPRedirection then HttpClient.get(response['location'], attempts - 1)
20
+ when Net::HTTPRedirection then HttpClient.get(response['location'], redirects - 1)
15
21
  else
16
22
  response.error!
17
23
  end
@@ -1,3 +1,3 @@
1
1
  module Omnivore
2
- VERSION = "0.0.4"
2
+ VERSION = "0.1.0"
3
3
  end
data/omnivore.gemspec CHANGED
@@ -5,10 +5,11 @@ require "omnivore/version"
5
5
  Gem::Specification.new do |s|
6
6
  s.name = "omnivore"
7
7
  s.version = Omnivore::VERSION
8
+ s.platform = Gem::Platform::RUBY
8
9
  s.authors = ["Matthias Eder"]
9
10
  s.email = ["matthias@izume.com"]
10
- s.homepage = ""
11
- s.summary = %q{Content extraction and analysis}
11
+ s.homepage = "http://github.com/matthiase/omnivore"
12
+ s.summary = %q{Content Extraction and Analysis Library}
12
13
  s.description = %q{A library for extracting content from HTML documents.}
13
14
 
14
15
  s.rubyforge_project = "omnivore"
@@ -19,6 +20,8 @@ Gem::Specification.new do |s|
19
20
  s.require_paths = ["lib"]
20
21
 
21
22
  # specify any dependencies here; for example:
23
+ s.add_development_dependency "yard", "~> 0.7.4"
24
+ s.add_development_dependency "redcarpet", "~> 2.0.1"
22
25
  s.add_development_dependency "rspec", "~> 2.8.0"
23
26
  s.add_runtime_dependency "nokogiri", "~> 1.5.0"
24
27
  end
metadata CHANGED
@@ -1,8 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: omnivore
3
3
  version: !ruby/object:Gem::Version
4
- prerelease:
5
- version: 0.0.4
4
+ hash: 27
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
6
11
  platform: ruby
7
12
  authors:
8
13
  - Matthias Eder
@@ -14,27 +19,69 @@ date: 2012-01-11 00:00:00 -07:00
14
19
  default_executable:
15
20
  dependencies:
16
21
  - !ruby/object:Gem::Dependency
17
- name: rspec
22
+ name: yard
18
23
  prerelease: false
19
24
  requirement: &id001 !ruby/object:Gem::Requirement
20
25
  none: false
21
26
  requirements:
22
27
  - - ~>
23
28
  - !ruby/object:Gem::Version
24
- version: 2.8.0
29
+ hash: 11
30
+ segments:
31
+ - 0
32
+ - 7
33
+ - 4
34
+ version: 0.7.4
25
35
  type: :development
26
36
  version_requirements: *id001
27
37
  - !ruby/object:Gem::Dependency
28
- name: nokogiri
38
+ name: redcarpet
29
39
  prerelease: false
30
40
  requirement: &id002 !ruby/object:Gem::Requirement
31
41
  none: false
32
42
  requirements:
33
43
  - - ~>
34
44
  - !ruby/object:Gem::Version
45
+ hash: 13
46
+ segments:
47
+ - 2
48
+ - 0
49
+ - 1
50
+ version: 2.0.1
51
+ type: :development
52
+ version_requirements: *id002
53
+ - !ruby/object:Gem::Dependency
54
+ name: rspec
55
+ prerelease: false
56
+ requirement: &id003 !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ hash: 47
62
+ segments:
63
+ - 2
64
+ - 8
65
+ - 0
66
+ version: 2.8.0
67
+ type: :development
68
+ version_requirements: *id003
69
+ - !ruby/object:Gem::Dependency
70
+ name: nokogiri
71
+ prerelease: false
72
+ requirement: &id004 !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ~>
76
+ - !ruby/object:Gem::Version
77
+ hash: 3
78
+ segments:
79
+ - 1
80
+ - 5
81
+ - 0
35
82
  version: 1.5.0
36
83
  type: :runtime
37
- version_requirements: *id002
84
+ version_requirements: *id004
38
85
  description: A library for extracting content from HTML documents.
39
86
  email:
40
87
  - matthias@izume.com
@@ -46,7 +93,6 @@ extra_rdoc_files: []
46
93
 
47
94
  files:
48
95
  - .gitignore
49
- - .rvmrc
50
96
  - Gemfile
51
97
  - README.md
52
98
  - Rakefile
@@ -59,7 +105,7 @@ files:
59
105
  - spec/fixtures/thia-breen-interview
60
106
  - spec/http_client_spec.rb
61
107
  has_rdoc: true
62
- homepage: ""
108
+ homepage: http://github.com/matthiase/omnivore
63
109
  licenses: []
64
110
 
65
111
  post_install_message:
@@ -72,19 +118,25 @@ required_ruby_version: !ruby/object:Gem::Requirement
72
118
  requirements:
73
119
  - - ">="
74
120
  - !ruby/object:Gem::Version
121
+ hash: 3
122
+ segments:
123
+ - 0
75
124
  version: "0"
76
125
  required_rubygems_version: !ruby/object:Gem::Requirement
77
126
  none: false
78
127
  requirements:
79
128
  - - ">="
80
129
  - !ruby/object:Gem::Version
130
+ hash: 3
131
+ segments:
132
+ - 0
81
133
  version: "0"
82
134
  requirements: []
83
135
 
84
136
  rubyforge_project: omnivore
85
- rubygems_version: 1.5.0
137
+ rubygems_version: 1.3.7
86
138
  signing_key:
87
139
  specification_version: 3
88
- summary: Content extraction and analysis
140
+ summary: Content Extraction and Analysis Library
89
141
  test_files: []
90
142
 
data/.rvmrc DELETED
@@ -1,7 +0,0 @@
1
-
2
- if [[ -d "${rvm_path:-$HOME/.rvm}/environments" \
3
- && -s "${rvm_path:-$HOME/.rvm}/environments/ruby-1.9.2-p0@omnivore" ]] ; then
4
- \. "${rvm_path:-$HOME/.rvm}/environments/ruby-1.9.2-p0@omnivore"
5
- else
6
- rvm --create "ruby-1.9.2-p0@omnivore"
7
- fi