scrappy 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +3 -0
- data/Manifest.txt +19 -0
- data/README.rdoc +176 -0
- data/Rakefile +20 -0
- data/bin/scrappy +228 -0
- data/kb/elmundo.yarf +92 -0
- data/lib/scrappy.rb +22 -0
- data/lib/scrappy/agent/agent.rb +90 -0
- data/lib/scrappy/agent/blind_agent.rb +34 -0
- data/lib/scrappy/agent/cluster.rb +35 -0
- data/lib/scrappy/agent/extractor.rb +159 -0
- data/lib/scrappy/agent/visual_agent.rb +72 -0
- data/lib/scrappy/proxy.rb +41 -0
- data/lib/scrappy/server.rb +77 -0
- data/lib/scrappy/shell.rb +70 -0
- data/lib/scrappy/support.rb +18 -0
- data/lib/scrappy/webkit/webkit.rb +18 -0
- data/test/test_helper.rb +3 -0
- data/test/test_scrappy.rb +11 -0
- metadata +233 -0
data/lib/scrappy/proxy.rb ADDED
@@ -0,0 +1,41 @@
+require 'camping'
+require 'camping/session'
+require 'open3'
+
+Camping.goes :Scrappy
+
+module Scrappy
+  module Controllers
+    class Index < R '.*'
+      include InputEscaping
+
+      def get
+        process_request :get
+      end
+
+      def post
+        process_request :post
+      end
+
+      protected
+      def process_request http_method
+        agent.proxy http_method, request.env["REQUEST_URI"], @input
+
+        case agent.status
+        when :redirect
+          redirect agent.uri
+        when :ok
+          @headers['Content-Type'] = agent.content_type
+          agent.output
+        else
+          @status = 500
+          'Error'
+        end
+      end
+
+      def agent
+        Scrappy::Agent[@request.env["REMOTE_ADDR"]]
+      end
+    end
+  end
+end
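proxy.rb exposes each Scrappy agent as an HTTP proxy: the catch-all Index controller forwards whatever URL arrives in REQUEST_URI to an agent keyed by the client's REMOTE_ADDR, so the same client keeps reusing the same agent. A minimal client sketch, assuming the proxy app is running on localhost port 8080 (the port is hypothetical; it is chosen by bin/scrappy, which is not part of this hunk):

    require 'net/http'

    # Hypothetical host/port; bin/scrappy decides where the proxy listens.
    proxy_host, proxy_port = 'localhost', 8080

    # With a proxy configured, Net::HTTP puts the absolute URI in the request
    # line, which is what the Index controller reads from REQUEST_URI.
    response = Net::HTTP.new('www.elmundo.es', 80, proxy_host, proxy_port).get('/')
    puts response['Content-Type']  # content type reported by the agent
    puts response.body             # the agent's output for the proxied page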
data/lib/scrappy/server.rb ADDED
@@ -0,0 +1,77 @@
+require 'camping'
+require 'camping/session'
+require 'open3'
+
+Camping.goes :Scrappy
+
+module Scrappy
+  include Camping::Session
+  secret '1a36591bceec49c832079e270d7e8b73'
+
+  module Controllers
+    class Index
+      def get
+        mab do
+          html do
+            head {}
+            body do
+              h1 "Scrappy Web Server"
+              p "Use following URL format: http://[host]/[format]/[url]"
+              p do
+                "For example: " + a("http://localhost:3434/rdfxml/http://www.google.com",
+                                    :href=>"http://localhost:3434/rdfxml/http://www.google.com")
+              end
+              p do
+                "Remember to escape parameters: " +
+                "http://www.example.com/~user/%3Ftest%3D1%26test1%3D2<br/> or<br/> " +
+                "http%3A%2F%2Fwww.example.com%2F~user%2F%3Ftest%3D1%26test1%3D2<br/>" +
+                "instead of<br/> http://www.example.com/~user/?test=1&test1=2"
+              end
+              p do
+                "Available formats are png, yarf, rdfxml, ntriples, turtle, json, jsonrdf, ejson"
+              end
+            end
+          end
+        end
+      end
+    end
+
+    class Extract < R '/(\w+)/(.+)'
+      include InputEscaping
+
+      def get format, url
+        process_request :get, format, url
+      end
+
+      def post format, url
+        process_request :post, format, url
+      end
+
+      protected
+      def process_request http_method, format, url
+        callback = @input['callback']
+        agent.proxy http_method, url, @input.reject{|k,v| k=='callback'}, format.to_sym
+
+        case agent.status
+        when :redirect
+          redirect "/#{format}/#{agent.uri}#{inputs}"
+        when :ok
+          @headers['Content-Type'] = agent.content_type
+          callback ? "#{callback}(#{agent.output})" : agent.output
+        else
+          @status = 500
+          'Error'
+        end
+      end
+
+      def agent
+        return @agent if @agent
+        if @state[:agent].nil? || @state[:token] != SESSION_TOKEN
+          @state[:token] = SESSION_TOKEN
+          @state[:agent] = Scrappy::Agent.create.id
+        end
+        @agent = Scrappy::Agent[@state[:agent]]
+      end
+    end
+  end
+end
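server.rb is the REST front end: the Extract controller maps /[format]/[url] to an extraction, keeps one agent per session, and wraps the output in a JSONP callback when a callback parameter is supplied. A client sketch, assuming the server is running locally on port 3434 (the port used in its own example links):

    require 'open-uri'
    require 'cgi'

    # The target URL must be escaped, exactly as the index page explains.
    target = 'http://www.example.com/~user/?test=1&test1=2'

    # N-Triples output; any format listed on the index page works the same way.
    puts open("http://localhost:3434/ntriples/#{CGI.escape(target)}").read

    # JSON output wrapped in a JSONP callback via the 'callback' parameter.
    puts open("http://localhost:3434/json/#{CGI.escape(target)}?callback=handleTriples").read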
data/lib/scrappy/shell.rb ADDED
@@ -0,0 +1,70 @@
+module Scrappy
+  class Shell
+    def initialize file=nil
+      @agent = Agent.create
+      @file = file
+    end
+
+    def run
+      commands = ['get', 'put', 'help']
+
+      Readline.completion_append_character = " "
+      Readline.completer_word_break_characters = ""
+      Readline.completion_proc = proc { |line| commands.grep(/^#{Regexp.escape(line)}/).sort }
+
+      if @file
+        open(@file, 'r').lines.each do |line|
+          break if process(line) == :quit
+        end
+      else
+        begin
+          line = Readline.readline(bash, true)
+          code = process line.nil? ? (puts 'quit' unless Options.quiet; 'quit') : line
+        end while code != :quit
+      end
+    end
+
+    protected
+    def process raw_command
+      command = raw_command.strip
+
+      code = if command =~ /\Aget\W(.*)\Z/
+        puts @agent.proxy :get, $1
+        puts ''
+      elsif command == 'help'
+        puts 'Available commands:'
+        puts ' get URL: Visit the specified URL'
+        puts ' help: Show this information'
+        puts ' quit: Exit scrappy shell'
+        puts ''
+      elsif command == 'quit'
+        :quit
+      elsif command == '' or command[0..0] == '#'
+        nil
+      else
+        puts "ERROR: Unknown command '#{command}'"
+        puts ''
+      end
+      code
+    end
+
+    def bash
+      return '' if Options.quiet
+      location = if @agent.uri
+        uri = URI::parse(@agent.uri)
+        path = uri.path.to_s
+        path = path[0..0] + "..." + path[-16..-1] if path.size > 20
+        if uri.query
+          query = "?" + uri.query
+          query = "?..." + query[-10..-1] if query.size > 13
+        else
+          query = ""
+        end
+        "#{uri.base}#{path}#{query}"
+      else
+        ''
+      end
+      "#{location}$ "
+    end
+  end
+end
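shell.rb implements the interactive scrappy shell: Readline completion for its commands, a prompt built from the agent's current location, and a batch mode that replays commands from a file, skipping blank lines and '#' comments. A sketch of the batch mode, assuming lib/scrappy.rb (not shown in this hunk) has been required so that Scrappy::Shell, its agents, and Options are set up:

    require 'scrappy'

    # Write a small command script: 'get URL' visits a page, '#' starts a
    # comment, and 'quit' ends the session.
    File.open('titles.scrappy', 'w') do |f|
      f.puts '# dump RDF for the front page'
      f.puts 'get http://www.elmundo.es/'
      f.puts 'quit'
    end

    # Replays the script line by line through Shell#process.
    Scrappy::Shell.new('titles.scrappy').run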
data/lib/scrappy/support.rb ADDED
@@ -0,0 +1,18 @@
+require 'open-uri'
+require 'net/http'
+require 'net/https'
+
+module URI
+  def base
+    self.to_s.split('/')[0..2] * '/'
+  end
+end
+
+module Scrappy
+  module InputEscaping
+    def inputs
+      return '' if @input.empty?
+      "?" + (@input.map{|k,v| "#{CGI.escape(k)}=#{CGI.escape(v)}"}*'')
+    end
+  end
+end
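support.rb patches URI with a base helper (used by the shell's prompt) that keeps only the scheme and host of a URI string, and defines InputEscaping#inputs, which the server uses to rebuild a query string on redirects. What base computes, written out inline:

    require 'uri'

    uri = URI.parse('http://www.elmundo.es/america/2010/index.html?s=1')
    # Splitting on '/' and keeping the first three pieces leaves scheme + host:
    uri.to_s.split('/')[0..2] * '/'  #=> "http://www.elmundo.es"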
data/lib/scrappy/webkit/webkit.rb ADDED
@@ -0,0 +1,18 @@
+require 'gtk2'
+module Gtk
+  module WebKit
+  end
+end
+
+require 'rbwebkitgtk.so'
+
+class Gtk::WebKit::WebView
+  alias :load_html_string_no_defaults :load_html_string
+  def load_html_string(content, base_uri=nil)
+    load_html_string_no_defaults(content, base_uri)
+  end
+
+  def mark_text_matches(test, case_sensitive=false, limit=0)
+    mark_text_matches_with_limit(test, case_sensitive, limit)
+  end
+end
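webkit.rb only works when the native rbwebkitgtk binding is installed, which the gem's post-install message marks as optional. A defensive-loading sketch (structure hypothetical; the gem's actual loader lives in lib/scrappy.rb, which is not shown here) that keeps the non-visual agent usable without GTK:

    begin
      require 'scrappy/webkit/webkit'
      visual = true
    rescue LoadError
      # rbwebkitgtk (or gtk2) is missing; fall back to the blind agent.
      visual = false
    end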
data/test/test_helper.rb ADDED
metadata ADDED
@@ -0,0 +1,233 @@
+--- !ruby/object:Gem::Specification
+name: scrappy
+version: !ruby/object:Gem::Version
+  prerelease: false
+  segments:
+  - 0
+  - 1
+  version: "0.1"
+platform: ruby
+authors:
+- Jose Ignacio
+autorequire:
+bindir: bin
+cert_chain: []
+
+date: 2010-10-07 00:00:00 +02:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: activesupport
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 2
+        - 3
+        - 5
+        version: 2.3.5
+  type: :runtime
+  version_requirements: *id001
+- !ruby/object:Gem::Dependency
+  name: markaby
+  prerelease: false
+  requirement: &id002 !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 0
+        - 7
+        - 1
+        version: 0.7.1
+  type: :runtime
+  version_requirements: *id002
+- !ruby/object:Gem::Dependency
+  name: camping
+  prerelease: false
+  requirement: &id003 !ruby/object:Gem::Requirement
+    requirements:
+    - - "="
+      - !ruby/object:Gem::Version
+        segments:
+        - 2
+        - 0
+        version: "2.0"
+  type: :runtime
+  version_requirements: *id003
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  prerelease: false
+  requirement: &id004 !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 1
+        - 4
+        - 1
+        version: 1.4.1
+  type: :runtime
+  version_requirements: *id004
+- !ruby/object:Gem::Dependency
+  name: mechanize
+  prerelease: false
+  requirement: &id005 !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 1
+        - 0
+        - 0
+        version: 1.0.0
+  type: :runtime
+  version_requirements: *id005
+- !ruby/object:Gem::Dependency
+  name: lightrdf
+  prerelease: false
+  requirement: &id006 !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 0
+        - 1
+        version: "0.1"
+  type: :runtime
+  version_requirements: *id006
+- !ruby/object:Gem::Dependency
+  name: rubyforge
+  prerelease: false
+  requirement: &id007 !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 2
+        - 0
+        - 4
+        version: 2.0.4
+  type: :development
+  version_requirements: *id007
+- !ruby/object:Gem::Dependency
+  name: hoe
+  prerelease: false
+  requirement: &id008 !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 2
+        - 6
+        - 0
+        version: 2.6.0
+  type: :development
+  version_requirements: *id008
+description: |-
+  Scrappy is a tool that allows extracting information from web pages and producing RDF data.
+  It uses the scraping ontology to define the mappings between HTML contents and RDF data.
+
+  An example of mapping is shown next, which allows extracting all titles from http://www.elmundo.es:
+
+  dc: http://purl.org/dc/elements/1.1/
+  rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
+  sioc: http://rdfs.org/sioc/ns#
+  sc: http://lab.gsi.dit.upm.es/scraping.rdf#
+  *:
+    rdf:type: sc:Fragment
+    sc:selector:
+      *:
+        rdf:type: sc:UriSelector
+        rdf:value: "http://www.elmundo.es/"
+    sc:identifier:
+      *:
+        rdf:type: sc:BaseUriSelector
+    sc:subfragment:
+      *:
+        sc:type: sioc:Post
+        sc:selector:
+          *:
+            rdf:type: sc:CssSelector
+            rdf:value: ".noticia h2, .noticia h3, .noticia h4"
+        sc:identifier:
+          *:
+            rdf:type: sc:CssSelector
+            rdf:value: "a"
+            sc:attribute: "href"
+        sc:subfragment:
+          *:
+            sc:type: rdf:Literal
+            sc:relation: dc:title
+            sc:selector:
+              *:
+                rdf:type: sc:CssSelector
+                rdf:value: "a"
+
+  (The above code is serialized using YARF format, supported by LightRDF gem, as well as
+  RDFXML, JSON, NTriples formats, which can also be used to define the mappings).
+email:
+- joseignacio.fernandez@gmail.com
+executables:
+- scrappy
+extensions: []
+
+extra_rdoc_files:
+- History.txt
+- Manifest.txt
+files:
+- History.txt
+- Manifest.txt
+- README.rdoc
+- Rakefile
+- bin/scrappy
+- kb/elmundo.yarf
+- lib/scrappy.rb
+- lib/scrappy/agent/agent.rb
+- lib/scrappy/agent/blind_agent.rb
+- lib/scrappy/agent/cluster.rb
+- lib/scrappy/agent/extractor.rb
+- lib/scrappy/agent/visual_agent.rb
+- lib/scrappy/proxy.rb
+- lib/scrappy/server.rb
+- lib/scrappy/shell.rb
+- lib/scrappy/support.rb
+- lib/scrappy/webkit/webkit.rb
+- test/test_helper.rb
+- test/test_scrappy.rb
+has_rdoc: true
+homepage: http://github.com/josei/scrappy
+licenses: []
+
+post_install_message: "**(Optional) Remember to install rbwebkitgtk for visual parsing features**"
+rdoc_options:
+- --main
+- README.rdoc
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      segments:
+      - 0
+      version: "0"
+requirements: []
+
+rubyforge_project: scrappy
+rubygems_version: 1.3.6
+signing_key:
+specification_version: 3
+summary: Web scraper that allows producing RDF data out of plain web pages
+test_files:
+- test/test_scrappy.rb
+- test/test_helper.rb
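The gemspec's description documents the scraping ontology mapping shipped in kb/elmundo.yarf: pages matching the UriSelector become sc:Fragment resources, each '.noticia' heading becomes a sioc:Post identified by its link, and the link text becomes its dc:title. A hedged end-to-end sketch, assuming the gem is installed and its web server (lib/scrappy/server.rb) is running on port 3434 with that mapping loaded:

    require 'open-uri'
    require 'cgi'

    # Ask the running scrappy server for N-Triples extracted from elmundo.es
    # and keep only the dc:title statements produced by the mapping above.
    data = open("http://localhost:3434/ntriples/#{CGI.escape('http://www.elmundo.es/')}").read
    data.each_line do |triple|
      puts triple if triple.include?('http://purl.org/dc/elements/1.1/title')
    end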