scrappy 0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +3 -0
- data/Manifest.txt +19 -0
- data/README.rdoc +176 -0
- data/Rakefile +20 -0
- data/bin/scrappy +228 -0
- data/kb/elmundo.yarf +92 -0
- data/lib/scrappy.rb +22 -0
- data/lib/scrappy/agent/agent.rb +90 -0
- data/lib/scrappy/agent/blind_agent.rb +34 -0
- data/lib/scrappy/agent/cluster.rb +35 -0
- data/lib/scrappy/agent/extractor.rb +159 -0
- data/lib/scrappy/agent/visual_agent.rb +72 -0
- data/lib/scrappy/proxy.rb +41 -0
- data/lib/scrappy/server.rb +77 -0
- data/lib/scrappy/shell.rb +70 -0
- data/lib/scrappy/support.rb +18 -0
- data/lib/scrappy/webkit/webkit.rb +18 -0
- data/test/test_helper.rb +3 -0
- data/test/test_scrappy.rb +11 -0
- metadata +233 -0
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'camping'
|
2
|
+
require 'camping/session'
|
3
|
+
require 'open3'
|
4
|
+
|
5
|
+
Camping.goes :Scrappy
|
6
|
+
|
7
|
+
module Scrappy
  module Controllers
    # Catch-all controller (route '.*'): hands every GET and POST over to the
    # agent associated with the caller's remote address and relays the
    # agent's answer.
    class Index < R '.*'
      include InputEscaping

      # Handles GET requests.
      def get
        process_request :get
      end

      # Handles POST requests.
      def post
        process_request :post
      end

      protected

      # Asks the agent to proxy the requested URI with the request input,
      # then maps the agent's status onto the HTTP response:
      # :redirect -> redirect to agent.uri, :ok -> agent.output with the
      # agent's content type, anything else -> 500 with the body 'Error'.
      def process_request http_method
        agent.proxy http_method, request.env["REQUEST_URI"], @input

        outcome = agent.status
        if outcome == :redirect
          redirect agent.uri
        elsif outcome == :ok
          @headers['Content-Type'] = agent.content_type
          agent.output
        else
          @status = 500
          'Error'
        end
      end

      # Looks up the agent keyed by the client's REMOTE_ADDR.
      def agent
        remote_address = @request.env["REMOTE_ADDR"]
        Scrappy::Agent[remote_address]
      end
    end
  end
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
require 'camping'
|
2
|
+
require 'camping/session'
|
3
|
+
require 'open3'
|
4
|
+
|
5
|
+
Camping.goes :Scrappy
|
6
|
+
|
7
|
+
module Scrappy
  include Camping::Session
  # NOTE(review): the session secret is hard-coded in the source; anyone who
  # can read this file knows it. Consider loading it from configuration.
  secret '1a36591bceec49c832079e270d7e8b73'

  module Controllers
    # Renders a short help page describing the URL format of the web server.
    class Index
      def get
        mab do
          html do
            head {}
            body do
              h1 "Scrappy Web Server"
              p  "Use following URL format: http://[host]/[format]/[url]"
              p do
                "For example: " + a("http://localhost:3434/rdfxml/http://www.google.com",
                :href=>"http://localhost:3434/rdfxml/http://www.google.com")
              end
              p do
                "Remember to escape parameters: " +
                "http://www.example.com/~user/%3Ftest%3D1%26test1%3D2<br/> or<br/> " +
                "http%3A%2F%2Fwww.example.com%2F~user%2F%3Ftest%3D1%26test1%3D2<br/>" +
                "instead of<br/> http://www.example.com/~user/?test=1&test1=2"
              end
              p do
                "Available formats are png, yarf, rdfxml, ntriples, turtle, json, jsonrdf, ejson"
              end
            end
          end
        end
      end
    end

    # Handles /[format]/[url]: asks the session's agent to fetch +url+ and
    # returns the agent's output, optionally wrapped in a JSONP callback.
    class Extract < R '/(\w+)/(.+)'
      include InputEscaping

      # A JSONP callback must be a dotted path of JavaScript identifiers.
      # Anything else would be echoed verbatim into the response body and
      # could inject arbitrary script.
      CALLBACK_PATTERN = /\A[a-zA-Z_$][\w$]*(?:\.[a-zA-Z_$][\w$]*)*\z/

      # Handles GET requests; +format+ and +url+ are the two route captures.
      def get format, url
        process_request :get, format, url
      end

      # Handles POST requests; +format+ and +url+ are the two route captures.
      def post format, url
        process_request :post, format, url
      end

      protected

      # Proxies the request through the agent and maps the agent's status
      # onto the HTTP response:
      # * :redirect -> redirect to the same format on agent.uri, keeping the
      #   request's query parameters
      # * :ok       -> agent.output (wrapped as "callback(output)" when a
      #   'callback' parameter was supplied) with the agent's content type
      # * otherwise -> 500 with the body 'Error'
      #
      # A 'callback' parameter that is not a plain identifier path is
      # rejected with status 400 before the agent is contacted. An empty
      # callback is treated as absent.
      def process_request http_method, format, url
        callback = @input['callback']
        callback = nil if callback.to_s.empty?
        if callback && callback.to_s !~ CALLBACK_PATTERN
          @status = 400
          return 'Invalid callback'
        end

        # NOTE(review): format comes from the request path (restricted to \w+
        # by the route). On Rubies that never garbage-collect symbols,
        # to_sym on request data can grow memory without bound.
        agent.proxy http_method, url, @input.reject{|k,v| k=='callback'}, format.to_sym

        case agent.status
        when :redirect
          redirect "/#{format}/#{agent.uri}#{inputs}"
        when :ok
          @headers['Content-Type'] = agent.content_type
          callback ? "#{callback}(#{agent.output})" : agent.output
        else
          @status = 500
          'Error'
        end
      end

      # Returns the agent bound to the current session, memoized for the
      # duration of the request. A new agent is created (and its id stored
      # in the session state) when the session has none yet or when its
      # token differs from SESSION_TOKEN.
      def agent
        return @agent if @agent
        if @state[:agent].nil? || @state[:token] != SESSION_TOKEN
          @state[:token] = SESSION_TOKEN
          @state[:agent] = Scrappy::Agent.create.id
        end
        @agent = Scrappy::Agent[@state[:agent]]
      end
    end
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
module Scrappy
  # Command shell that drives a single agent, either interactively through
  # Readline or from a script containing one command per line.
  class Shell
    # Commands understood by #process; also offered for tab completion.
    COMMANDS = %w[get help quit].freeze

    # file - optional path of a command script; when nil the shell prompts
    #        interactively.
    def initialize file=nil
      @agent = Agent.create
      @file  = file
    end

    # Executes commands until the input is exhausted or 'quit' is processed.
    def run
      if @file
        run_script
      else
        run_interactive
      end
    end

    protected

    # Feeds each line of the script to #process, stopping at 'quit'.
    # The block form of open guarantees the handle is closed, including
    # when the loop is left early.
    def run_script
      open(@file, 'r') do |script|
        script.each_line do |line|
          break if process(line) == :quit
        end
      end
    end

    # Prompts for commands with Readline until 'quit' is entered.
    # End of input (Readline returning nil) is treated as 'quit'.
    def run_interactive
      Readline.completion_append_character = " "
      Readline.completer_word_break_characters = ""
      Readline.completion_proc = proc { |line| COMMANDS.grep(/\A#{Regexp.escape(line)}/).sort }

      loop do
        line = Readline.readline(bash, true)
        if line.nil?
          puts 'quit' unless Options.quiet
          line = 'quit'
        end
        break if process(line) == :quit
      end
    end

    # Executes one command line.
    #
    # Blank lines and lines starting with '#' are ignored.
    #
    # Returns :quit when the shell should terminate, nil otherwise.
    def process raw_command
      command = raw_command.strip

      if command =~ /\Aget\W(.*)\Z/
        puts @agent.proxy(:get, $1)
        puts ''
        nil
      elsif command == 'help'
        puts 'Available commands:'
        puts '  get URL: Visit the specified URL'
        puts '  help: Show this information'
        puts '  quit: Exit scrappy shell'
        puts ''
        nil
      elsif command == 'quit'
        :quit
      elsif command == '' or command[0..0] == '#'
        nil
      else
        puts "ERROR: Unknown command '#{command}'"
        puts ''
        nil
      end
    end

    # Builds the prompt: an abbreviated form of the agent's current URI
    # followed by "$ ". Returns '' when Options.quiet is set.
    def bash
      return '' if Options.quiet

      location = ''
      if @agent.uri
        uri  = URI::parse(@agent.uri)
        # Long paths keep their first character and last 16 characters.
        path = uri.path.to_s
        path = path[0..0] + "..." + path[-16..-1] if path.size > 20
        # Long queries keep only their last 10 characters.
        query = uri.query ? "?" + uri.query : ""
        query = "?..." + query[-10..-1] if query.size > 13
        location = "#{uri.base}#{path}#{query}"
      end
      "#{location}$ "
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
require 'net/http'
|
3
|
+
require 'net/https'
|
4
|
+
|
5
|
+
module URI
  # Returns the URI's string form cut off before its third "/", i.e. the
  # first three "/"-separated segments re-joined
  # ("http://host/a/b" -> "http://host").
  def base
    segments = to_s.split('/')
    segments.first(3).join('/')
  end
end
|
10
|
+
|
11
|
+
module Scrappy
  # Mixin for objects that keep their request parameters in @input.
  module InputEscaping
    # Serializes @input as a URL query string.
    #
    # Returns '' when @input is empty; otherwise "?" followed by the
    # CGI-escaped key=value pairs joined with "&". Keys and values are
    # converted with to_s first so non-String entries do not raise.
    def inputs
      return '' if @input.empty?
      pairs = @input.map { |k, v| "#{CGI.escape(k.to_s)}=#{CGI.escape(v.to_s)}" }
      "?" + pairs.join('&')
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'gtk2'
|
2
|
+
# Declares the (empty) Gtk::WebKit namespace.
# NOTE(review): presumably needed so the native extension required afterwards
# can define its classes inside it - confirm against rbwebkitgtk.
module Gtk
  module WebKit; end
end
|
6
|
+
|
7
|
+
require 'rbwebkitgtk.so'
|
8
|
+
|
9
|
+
# Convenience wrappers that give optional arguments to two WebView methods.
class Gtk::WebKit::WebView
  # Keep the original two-argument method reachable under a new name.
  alias_method :load_html_string_no_defaults, :load_html_string

  # Same as the original load_html_string, but base_uri may be omitted
  # (defaults to nil).
  def load_html_string(content, base_uri = nil)
    load_html_string_no_defaults(content, base_uri)
  end

  # Delegates to mark_text_matches_with_limit, defaulting to a
  # case-insensitive search with a limit of 0.
  def mark_text_matches(test, case_sensitive = false, limit = 0)
    mark_text_matches_with_limit(test, case_sensitive, limit)
  end
end
|
data/test/test_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,233 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: scrappy
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 1
|
8
|
+
version: "0.1"
|
9
|
+
platform: ruby
|
10
|
+
authors:
|
11
|
+
- Jose Ignacio
|
12
|
+
autorequire:
|
13
|
+
bindir: bin
|
14
|
+
cert_chain: []
|
15
|
+
|
16
|
+
date: 2010-10-07 00:00:00 +02:00
|
17
|
+
default_executable:
|
18
|
+
dependencies:
|
19
|
+
- !ruby/object:Gem::Dependency
|
20
|
+
name: activesupport
|
21
|
+
prerelease: false
|
22
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
segments:
|
27
|
+
- 2
|
28
|
+
- 3
|
29
|
+
- 5
|
30
|
+
version: 2.3.5
|
31
|
+
type: :runtime
|
32
|
+
version_requirements: *id001
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: markaby
|
35
|
+
prerelease: false
|
36
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
segments:
|
41
|
+
- 0
|
42
|
+
- 7
|
43
|
+
- 1
|
44
|
+
version: 0.7.1
|
45
|
+
type: :runtime
|
46
|
+
version_requirements: *id002
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: camping
|
49
|
+
prerelease: false
|
50
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
segments:
|
55
|
+
- 2
|
56
|
+
- 0
|
57
|
+
version: "2.0"
|
58
|
+
type: :runtime
|
59
|
+
version_requirements: *id003
|
60
|
+
- !ruby/object:Gem::Dependency
|
61
|
+
name: nokogiri
|
62
|
+
prerelease: false
|
63
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
segments:
|
68
|
+
- 1
|
69
|
+
- 4
|
70
|
+
- 1
|
71
|
+
version: 1.4.1
|
72
|
+
type: :runtime
|
73
|
+
version_requirements: *id004
|
74
|
+
- !ruby/object:Gem::Dependency
|
75
|
+
name: mechanize
|
76
|
+
prerelease: false
|
77
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
78
|
+
requirements:
|
79
|
+
- - ">="
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
segments:
|
82
|
+
- 1
|
83
|
+
- 0
|
84
|
+
- 0
|
85
|
+
version: 1.0.0
|
86
|
+
type: :runtime
|
87
|
+
version_requirements: *id005
|
88
|
+
- !ruby/object:Gem::Dependency
|
89
|
+
name: lightrdf
|
90
|
+
prerelease: false
|
91
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
92
|
+
requirements:
|
93
|
+
- - ">="
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
segments:
|
96
|
+
- 0
|
97
|
+
- 1
|
98
|
+
version: "0.1"
|
99
|
+
type: :runtime
|
100
|
+
version_requirements: *id006
|
101
|
+
- !ruby/object:Gem::Dependency
|
102
|
+
name: rubyforge
|
103
|
+
prerelease: false
|
104
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
105
|
+
requirements:
|
106
|
+
- - ">="
|
107
|
+
- !ruby/object:Gem::Version
|
108
|
+
segments:
|
109
|
+
- 2
|
110
|
+
- 0
|
111
|
+
- 4
|
112
|
+
version: 2.0.4
|
113
|
+
type: :development
|
114
|
+
version_requirements: *id007
|
115
|
+
- !ruby/object:Gem::Dependency
|
116
|
+
name: hoe
|
117
|
+
prerelease: false
|
118
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
119
|
+
requirements:
|
120
|
+
- - ">="
|
121
|
+
- !ruby/object:Gem::Version
|
122
|
+
segments:
|
123
|
+
- 2
|
124
|
+
- 6
|
125
|
+
- 0
|
126
|
+
version: 2.6.0
|
127
|
+
type: :development
|
128
|
+
version_requirements: *id008
|
129
|
+
description: |-
|
130
|
+
Scrappy is a tool that allows extracting information from web pages and producing RDF data.
|
131
|
+
It uses the scraping ontology to define the mappings between HTML contents and RDF data.
|
132
|
+
|
133
|
+
An example of mapping is shown next, which allows extracting all titles from http://www.elmundo.es:
|
134
|
+
|
135
|
+
dc: http://purl.org/dc/elements/1.1/
|
136
|
+
rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
|
137
|
+
sioc: http://rdfs.org/sioc/ns#
|
138
|
+
sc: http://lab.gsi.dit.upm.es/scraping.rdf#
|
139
|
+
*:
|
140
|
+
rdf:type: sc:Fragment
|
141
|
+
sc:selector:
|
142
|
+
*:
|
143
|
+
rdf:type: sc:UriSelector
|
144
|
+
rdf:value: "http://www.elmundo.es/"
|
145
|
+
sc:identifier:
|
146
|
+
*:
|
147
|
+
rdf:type: sc:BaseUriSelector
|
148
|
+
sc:subfragment:
|
149
|
+
*:
|
150
|
+
sc:type: sioc:Post
|
151
|
+
sc:selector:
|
152
|
+
*:
|
153
|
+
rdf:type: sc:CssSelector
|
154
|
+
rdf:value: ".noticia h2, .noticia h3, .noticia h4"
|
155
|
+
sc:identifier:
|
156
|
+
*:
|
157
|
+
rdf:type: sc:CssSelector
|
158
|
+
rdf:value: "a"
|
159
|
+
sc:attribute: "href"
|
160
|
+
sc:subfragment:
|
161
|
+
*:
|
162
|
+
sc:type: rdf:Literal
|
163
|
+
sc:relation: dc:title
|
164
|
+
sc:selector:
|
165
|
+
*:
|
166
|
+
rdf:type: sc:CssSelector
|
167
|
+
rdf:value: "a"
|
168
|
+
|
169
|
+
(The above code is serialized using YARF format, supported by LightRDF gem, as well as
|
170
|
+
RDFXML, JSON, NTriples formats, which can also be used to define the mappings).
|
171
|
+
email:
|
172
|
+
- joseignacio.fernandez@gmail.com
|
173
|
+
executables:
|
174
|
+
- scrappy
|
175
|
+
extensions: []
|
176
|
+
|
177
|
+
extra_rdoc_files:
|
178
|
+
- History.txt
|
179
|
+
- Manifest.txt
|
180
|
+
files:
|
181
|
+
- History.txt
|
182
|
+
- Manifest.txt
|
183
|
+
- README.rdoc
|
184
|
+
- Rakefile
|
185
|
+
- bin/scrappy
|
186
|
+
- kb/elmundo.yarf
|
187
|
+
- lib/scrappy.rb
|
188
|
+
- lib/scrappy/agent/agent.rb
|
189
|
+
- lib/scrappy/agent/blind_agent.rb
|
190
|
+
- lib/scrappy/agent/cluster.rb
|
191
|
+
- lib/scrappy/agent/extractor.rb
|
192
|
+
- lib/scrappy/agent/visual_agent.rb
|
193
|
+
- lib/scrappy/proxy.rb
|
194
|
+
- lib/scrappy/server.rb
|
195
|
+
- lib/scrappy/shell.rb
|
196
|
+
- lib/scrappy/support.rb
|
197
|
+
- lib/scrappy/webkit/webkit.rb
|
198
|
+
- test/test_helper.rb
|
199
|
+
- test/test_scrappy.rb
|
200
|
+
has_rdoc: true
|
201
|
+
homepage: http://github.com/josei/scrappy
|
202
|
+
licenses: []
|
203
|
+
|
204
|
+
post_install_message: "**(Optional) Remember to install rbwebkitgtk for visual parsing features**"
|
205
|
+
rdoc_options:
|
206
|
+
- --main
|
207
|
+
- README.rdoc
|
208
|
+
require_paths:
|
209
|
+
- lib
|
210
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
211
|
+
requirements:
|
212
|
+
- - ">="
|
213
|
+
- !ruby/object:Gem::Version
|
214
|
+
segments:
|
215
|
+
- 0
|
216
|
+
version: "0"
|
217
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
218
|
+
requirements:
|
219
|
+
- - ">="
|
220
|
+
- !ruby/object:Gem::Version
|
221
|
+
segments:
|
222
|
+
- 0
|
223
|
+
version: "0"
|
224
|
+
requirements: []
|
225
|
+
|
226
|
+
rubyforge_project: scrappy
|
227
|
+
rubygems_version: 1.3.6
|
228
|
+
signing_key:
|
229
|
+
specification_version: 3
|
230
|
+
summary: Web scraper that allows producing RDF data out of plain web pages
|
231
|
+
test_files:
|
232
|
+
- test/test_scrappy.rb
|
233
|
+
- test/test_helper.rb
|