boilerpipe 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +21 -0
- data/Rakefile +0 -0
- data/boilerpipe.gemspec +20 -0
- data/lib/boilerpipe.rb +26 -0
- metadata +70 -0
data/README.textile
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
This gem is a Ruby wrapper for the Boilerpipe web API.
|
2
|
+
Boilerpipe definition:
|
3
|
+
|
4
|
+
bq. The boilerpipe library provides algorithms to detect and remove the surplus "clutter" (boilerplate, templates) around the main textual content of a web page.
|
5
|
+
|
6
|
+
For more information: http://code.google.com/p/boilerpipe/
|
7
|
+
|
8
|
+
h1. Explanation
|
9
|
+
|
10
|
+
The Boilerpipe module exposes a single method, extract. It takes two parameters: the URL of the page to process and an optional hash of options.
|
11
|
+
The hash accepts 3 options (for output and extractor, the first value listed is the default):
|
12
|
+
* output => :html, :htmlFragment, :text, :json, :debug
|
13
|
+
* extractor => :ArticleExtractor, :DefaultExtractor, :LargestContentExtractor, :KeepEverythingExtractor, :CanolaExtractor
|
14
|
+
* api => the URL of the Boilerpipe API endpoint (defaults to http://boilerpipe-web.appspot.com/extract)
|
15
|
+
|
16
|
+
None of these options is mandatory. To find out more about them, check out the Boilerpipe API: http://boilerpipe-web.appspot.com/
|
17
|
+
|
18
|
+
h1. Example
|
19
|
+
|
20
|
+
>> require "boilerpipe"
|
21
|
+
>> Boilerpipe.extract("http://techcrunch.com/2011/05/12/karma-is-a-bitch/", {:output => :json})
|
data/Rakefile
ADDED
File without changes
|
data/boilerpipe.gemspec
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# Gem specification for boilerpipe, a Ruby wrapper around the Boilerpipe
# web API.
#
# The deprecated `has_rdoc` setting has been dropped: RubyGems ignores it,
# so assigning it only produces deprecation noise on newer RubyGems.
Gem::Specification.new do |spec|
  spec.name        = "boilerpipe"
  spec.version     = "0.0.1"
  spec.date        = "2010-05-13"
  spec.summary     = "Ruby wrapper of the Boilerpipe API"
  spec.description = "Ruby wrapper of the Boilerpipe API"
  spec.email       = "g.marcilhacy@gmail.com"
  spec.homepage    = "https://github.com/gregorym/boilerpipe"
  spec.authors     = ["Grégory Marcilhacy"]

  spec.require_paths = %w[lib]

  # Explicit manifest of every file shipped in the gem.
  spec.files = %w[
    boilerpipe.gemspec
    README.textile
    Rakefile
    lib/boilerpipe.rb
  ]
end
|
data/lib/boilerpipe.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
|
3
|
+
# Minimal presence helpers added to every object.
#
# Each method is only defined when Object does not already provide it, so
# loading this file after a library that ships its own `blank?`/`present?`
# (ActiveSupport, for instance) leaves that implementation untouched.
class Object
  unless method_defined?(:blank?)
    # Returns true when the receiver is "empty": objects that respond to
    # `empty?` delegate to it; anything else is blank only when it is
    # falsy (nil or false).
    def blank?
      respond_to?(:empty?) ? empty? : !self
    end
  end

  unless method_defined?(:present?)
    # Logical opposite of `blank?`.
    def present?
      !blank?
    end
  end
end
|
12
|
+
|
13
|
+
# Thin client for the Boilerpipe web API, which extracts the main textual
# content of a web page.
module Boilerpipe
  # Endpoint used when the caller does not supply the :api option.
  DEFAULT_API_URL = 'http://boilerpipe-web.appspot.com/extract'.freeze
  # Extractor names listed for the API; the first entry is the default.
  EXTRACTORS = [ :ArticleExtractor, :DefaultExtractor, :LargestContentExtractor, :KeepEverythingExtractor, :CanolaExtractor ].freeze
  # Output formats listed for the API; the first entry is the default.
  OUTPUT_FORMATS = [ :html, :htmlFragment, :text, :json, :debug ].freeze

  # Asks the Boilerpipe API to process +extract_url+ and returns the raw
  # response body.
  #
  # extract_url - the (unescaped) URL of the page to process.
  # opts        - optional Hash:
  #               :output    - output format (default: OUTPUT_FORMATS.first)
  #               :extractor - extractor name (default: EXTRACTORS.first)
  #               :api       - API endpoint URL (default: DEFAULT_API_URL)
  #               Nil, false or empty values fall back to the default.
  #
  # Returns the response body as read by open-uri.
  # Raises ArgumentError when the resulting endpoint is not an http(s) URL,
  # URI::InvalidURIError when it cannot be parsed at all, and whatever
  # open-uri raises for network/HTTP failures.
  def self.extract(extract_url, opts = {})
    # Locals instead of module-level instance variables: nothing from one
    # call should leak into, or race with, another call.
    output    = option_or_default(opts[:output], OUTPUT_FORMATS.first)
    extractor = option_or_default(opts[:extractor], EXTRACTORS.first)
    api       = option_or_default(opts[:api], DEFAULT_API_URL)

    # Form-encode the parameters so that a target URL carrying its own
    # query string ("?", "&", "=") cannot corrupt the API request.
    query = URI.encode_www_form([
      ['url', extract_url],
      ['extractor', extractor],
      ['output', output]
    ])

    endpoint = URI.parse("#{api}?#{query}")
    # Only talk to http(s) endpoints. Handing the string to Kernel#open
    # would also accept local paths and "|command" pipes.
    unless endpoint.is_a?(URI::HTTP)
      raise ArgumentError, "Boilerpipe API url must be an http(s) URL: #{api.inspect}"
    end

    endpoint.read
  end

  # Returns +value+ unless it is nil, false or empty, in which case
  # +default+ is returned instead.
  def self.option_or_default(value, default)
    missing = value.respond_to?(:empty?) ? value.empty? : !value
    missing ? default : value
  end
  private_class_method :option_or_default
end
|
metadata
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: boilerpipe
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 29
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 1
|
10
|
+
version: 0.0.1
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- "Gr\xC3\xA9gory Marcilhacy"
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2010-05-13 00:00:00 +02:00
|
19
|
+
default_executable:
|
20
|
+
dependencies: []
|
21
|
+
|
22
|
+
description: Ruby wrapper of the Boilerpipe API
|
23
|
+
email: g.marcilhacy@gmail.com
|
24
|
+
executables: []
|
25
|
+
|
26
|
+
extensions: []
|
27
|
+
|
28
|
+
extra_rdoc_files: []
|
29
|
+
|
30
|
+
files:
|
31
|
+
- boilerpipe.gemspec
|
32
|
+
- README.textile
|
33
|
+
- Rakefile
|
34
|
+
- lib/boilerpipe.rb
|
35
|
+
has_rdoc: true
|
36
|
+
homepage: https://github.com/gregorym/boilerpipe
|
37
|
+
licenses: []
|
38
|
+
|
39
|
+
post_install_message:
|
40
|
+
rdoc_options: []
|
41
|
+
|
42
|
+
require_paths:
|
43
|
+
- lib
|
44
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
45
|
+
none: false
|
46
|
+
requirements:
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
hash: 3
|
50
|
+
segments:
|
51
|
+
- 0
|
52
|
+
version: "0"
|
53
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
54
|
+
none: false
|
55
|
+
requirements:
|
56
|
+
- - ">="
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
hash: 3
|
59
|
+
segments:
|
60
|
+
- 0
|
61
|
+
version: "0"
|
62
|
+
requirements: []
|
63
|
+
|
64
|
+
rubyforge_project:
|
65
|
+
rubygems_version: 1.6.2
|
66
|
+
signing_key:
|
67
|
+
specification_version: 3
|
68
|
+
summary: Ruby wrapper of the Boilerpipe API
|
69
|
+
test_files: []
|
70
|
+
|