boilerpipe 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +21 -0
- data/Rakefile +0 -0
- data/boilerpipe.gemspec +20 -0
- data/lib/boilerpipe.rb +26 -0
- metadata +70 -0
data/README.textile
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
This gem is a Ruby wrapper for the Boilerpipe API.
|
2
|
+
Boilerpipe definition:
|
3
|
+
|
4
|
+
bq. The boilerpipe library provides algorithms to detect and remove the surplus "clutter" (boilerplate, templates) around the main textual content of a web page.
|
5
|
+
|
6
|
+
For more information: http://code.google.com/p/boilerpipe/
|
7
|
+
|
8
|
+
h1. Explanation
|
9
|
+
|
10
|
+
The Boilerpipe module has a single method, extract. It takes two parameters: first the URL of the page to process, and second an optional hash of options.
|
11
|
+
The hash can have 3 options:
|
12
|
+
* output => :html, :htmlFragment, :text, :json, :debug
|
13
|
+
* extractor => :ArticleExtractor, :DefaultExtractor, :LargestContentExtractor, :KeepEverythingExtractor, :CanolaExtractor
|
14
|
+
* api => The API URL
|
15
|
+
|
16
|
+
None of these options is mandatory. To find out more about them, check out the Boilerpipe API: http://boilerpipe-web.appspot.com/
|
17
|
+
|
18
|
+
h1. Example
|
19
|
+
|
20
|
+
>> require "boilerpipe"
|
21
|
+
>> Boilerpipe.extract("http://techcrunch.com/2011/05/12/karma-is-a-bitch/", {:output => :json})
|
data/Rakefile
ADDED
File without changes
|
data/boilerpipe.gemspec
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# Gem specification for boilerpipe, a Ruby wrapper around the Boilerpipe
# web API. The specification object is the value of this expression, which is
# what RubyGems expects when it evaluates a .gemspec file.
Gem::Specification.new do |s|
  s.name        = "boilerpipe"
  s.version     = "0.0.1"
  # NOTE(review): the README example links to an article dated 2011-05-12, so
  # the year here may be a typo for 2011 -- confirm before the next release.
  s.date        = "2010-05-13"
  s.summary     = "Ruby wrapper of the Boilerpipe API"
  s.description = "Ruby wrapper of the Boilerpipe API"
  s.email       = "g.marcilhacy@gmail.com"
  s.homepage    = "https://github.com/gregorym/boilerpipe"
  s.authors     = ["Grégory Marcilhacy"]

  # `s.has_rdoc = false` was dropped on purpose: the setter is deprecated and
  # absent from current RubyGems releases, where calling it raises
  # NoMethodError and prevents the gem from being built or installed.

  s.require_paths = %w[lib]

  # Every file shipped in the packaged gem, relative to the project root.
  s.files = %w[
    boilerpipe.gemspec
    README.textile
    Rakefile
    lib/boilerpipe.rb
  ]
end
|
data/lib/boilerpipe.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
|
3
|
+
# Adds minimal blank?/present? predicates to every object.
#
# Each method is only defined when Object does not already provide it, so a
# fuller implementation loaded earlier (for example ActiveSupport's) is left
# untouched instead of being silently overwritten.
class Object
  unless method_defined?(:blank?)
    # Returns true when the receiver is nil, false, or reports itself empty
    # through #empty? (empty String, Array, Hash, Symbol, ...).
    # Whitespace-only strings are NOT considered blank by this implementation.
    def blank?
      respond_to?(:empty?) ? !!empty? : !self
    end
  end

  unless method_defined?(:present?)
    # Returns the logical opposite of #blank?.
    def present?
      !blank?
    end
  end
end
|
12
|
+
|
13
|
+
# Thin client for the Boilerpipe web API, which strips boilerplate from a web
# page and returns its main content.
module Boilerpipe
  # Endpoint used when the caller does not supply :api.
  DEFAULT_API_URL = 'http://boilerpipe-web.appspot.com/extract'.freeze
  # Extractor names listed for the API; the first entry is the default.
  EXTRACTORS = [ :ArticleExtractor, :DefaultExtractor, :LargestContentExtractor, :KeepEverythingExtractor, :CanolaExtractor ].freeze
  # Output formats listed for the API; the first entry is the default.
  OUTPUT_FORMATS = [ :html, :htmlFragment, :text, :json, :debug ].freeze

  # Fetches the extracted content of +extract_url+ from the Boilerpipe API.
  #
  # @param extract_url [String] address of the page to process
  # @param opts [Hash] optional settings; nil, false or empty values fall back
  #   to the defaults
  # @option opts [Symbol, String] :output    defaults to OUTPUT_FORMATS.first
  # @option opts [Symbol, String] :extractor defaults to EXTRACTORS.first
  # @option opts [String]         :api       defaults to DEFAULT_API_URL
  # @return [String] the raw response body returned by the API
  # @raise [OpenURI::HTTPError] when the API answers with an HTTP error status
  def self.extract(extract_url, opts = {})
    # Locals rather than module-level instance variables: no state is shared
    # between calls, so concurrent callers cannot overwrite each other.
    output    = option_or_default(opts[:output], OUTPUT_FORMATS.first)
    extractor = option_or_default(opts[:extractor], EXTRACTORS.first)
    api       = option_or_default(opts[:api], DEFAULT_API_URL).to_s

    # Percent-encode every value. Without this, a page URL that carries its
    # own query string ("?a=1&b=2") would be split into separate API
    # parameters and the request would target the wrong page.
    query = URI.encode_www_form([
      ['url', extract_url],
      ['extractor', extractor],
      ['output', output]
    ])
    separator = api.include?('?') ? '&' : '?'

    # Read through the parsed URI instead of Kernel#open: Kernel#open no
    # longer accepts URLs on Ruby 3.0+, and on older versions it would treat
    # an :api value such as "| cmd" as a command to execute.
    URI.parse("#{api}#{separator}#{query}").read
  end

  # Returns +value+, or +default+ when +value+ is nil, false or empty.
  def self.option_or_default(value, default)
    blank = value.respond_to?(:empty?) ? value.empty? : !value
    blank ? default : value
  end
  private_class_method :option_or_default
end
|
metadata
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: boilerpipe
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 29
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 1
|
10
|
+
version: 0.0.1
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- "Gr\xC3\xA9gory Marcilhacy"
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2010-05-13 00:00:00 +02:00
|
19
|
+
default_executable:
|
20
|
+
dependencies: []
|
21
|
+
|
22
|
+
description: Ruby wrapper of the Boilerpipe API
|
23
|
+
email: g.marcilhacy@gmail.com
|
24
|
+
executables: []
|
25
|
+
|
26
|
+
extensions: []
|
27
|
+
|
28
|
+
extra_rdoc_files: []
|
29
|
+
|
30
|
+
files:
|
31
|
+
- boilerpipe.gemspec
|
32
|
+
- README.textile
|
33
|
+
- Rakefile
|
34
|
+
- lib/boilerpipe.rb
|
35
|
+
has_rdoc: true
|
36
|
+
homepage: https://github.com/gregorym/boilerpipe
|
37
|
+
licenses: []
|
38
|
+
|
39
|
+
post_install_message:
|
40
|
+
rdoc_options: []
|
41
|
+
|
42
|
+
require_paths:
|
43
|
+
- lib
|
44
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
45
|
+
none: false
|
46
|
+
requirements:
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
hash: 3
|
50
|
+
segments:
|
51
|
+
- 0
|
52
|
+
version: "0"
|
53
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
54
|
+
none: false
|
55
|
+
requirements:
|
56
|
+
- - ">="
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
hash: 3
|
59
|
+
segments:
|
60
|
+
- 0
|
61
|
+
version: "0"
|
62
|
+
requirements: []
|
63
|
+
|
64
|
+
rubyforge_project:
|
65
|
+
rubygems_version: 1.6.2
|
66
|
+
signing_key:
|
67
|
+
specification_version: 3
|
68
|
+
summary: Ruby wrapper of the Boilerpipe API
|
69
|
+
test_files: []
|
70
|
+
|