scrappy 0.1.22 → 0.1.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/Manifest +7 -3
- data/bin/scrappy +6 -6
- data/lib/scrappy/{proxy.rb → server/proxy.rb} +0 -0
- data/lib/scrappy/server/public/images/logo.png +0 -0
- data/lib/scrappy/server/public/images/logo_small.png +0 -0
- data/lib/scrappy/server/public/stylesheets/application.css +40 -0
- data/lib/scrappy/{server.rb → server/server.rb} +18 -4
- data/lib/scrappy/server/views/help.haml +25 -0
- data/lib/scrappy/server/views/home.haml +25 -0
- data/lib/scrappy.rb +1 -1
- data/scrappy.gemspec +4 -4
- metadata +17 -9
- data/lib/scrappy/views/home.haml +0 -23
data/History.txt
CHANGED
data/Manifest
CHANGED
@@ -14,7 +14,6 @@ lib/scrappy/agent/formats.rb
|
|
14
14
|
lib/scrappy/agent/map_reduce.rb
|
15
15
|
lib/scrappy/agent/extractor.rb
|
16
16
|
lib/scrappy/agent/visual_agent.rb
|
17
|
-
lib/scrappy/proxy.rb
|
18
17
|
lib/scrappy/selectors/base_uri.rb
|
19
18
|
lib/scrappy/selectors/css.rb
|
20
19
|
lib/scrappy/selectors/new_uri.rb
|
@@ -24,10 +23,15 @@ lib/scrappy/selectors/slice.rb
|
|
24
23
|
lib/scrappy/selectors/uri.rb
|
25
24
|
lib/scrappy/selectors/uri_pattern.rb
|
26
25
|
lib/scrappy/selectors/xpath.rb
|
27
|
-
lib/scrappy/server.rb
|
26
|
+
lib/scrappy/server/proxy.rb
|
27
|
+
lib/scrappy/server/server.rb
|
28
|
+
lib/scrappy/server/public/images/logo.png
|
29
|
+
lib/scrappy/server/public/images/logo_small.png
|
30
|
+
lib/scrappy/server/public/stylesheets/application.css
|
31
|
+
lib/scrappy/server/views/home.haml
|
32
|
+
lib/scrappy/server/views/help.haml
|
28
33
|
lib/scrappy/shell.rb
|
29
34
|
lib/scrappy/support.rb
|
30
|
-
lib/scrappy/views/home.haml
|
31
35
|
lib/scrappy/webkit/webkit.rb
|
32
36
|
test/test_helper.rb
|
33
37
|
test/test_scrappy.rb
|
data/bin/scrappy
CHANGED
@@ -36,7 +36,7 @@ module Scrappy
|
|
36
36
|
opts.on('-D', '--dump') { Agent::Options.dump = true; Agent::Options.format = :rdf }
|
37
37
|
opts.on('-u', '--debug') { Agent::Options.debug = true }
|
38
38
|
opts.on('-i', '--interactive') { Options.shell = true; Agent::Options.format_header = false }
|
39
|
-
opts.on('-s', '--server [
|
39
|
+
opts.on('-s', '--server [BASE_URI]') { |uri| Options.server = true; Options.base_uri = uri }
|
40
40
|
opts.on('-S', '--proxy-server') { Options.proxy = true }
|
41
41
|
opts.on('-P P', '--port P') { |p| Options.port = p }
|
42
42
|
opts.on('-c C', '--concurrence C') { |c| Agent::Options.workers = c.to_i }
|
@@ -57,15 +57,15 @@ module Scrappy
|
|
57
57
|
Options.quiet = true
|
58
58
|
puts Agent.create.proxy(:http_method=>:get, :uri=>Options.url).output
|
59
59
|
elsif Options.proxy
|
60
|
-
puts "Launching Scrappy Web Proxy..."
|
61
|
-
require 'scrappy/proxy'
|
60
|
+
puts "Launching Scrappy Web Proxy (set http://localhost:#{Options.port} as proxy)..."
|
61
|
+
require 'scrappy/server/proxy'
|
62
62
|
Thin::Logging.silent = true
|
63
63
|
Scrappy::Proxy.run! :host => 'localhost', :port => Options.port, :environment=>:production
|
64
64
|
elsif Options.server
|
65
|
-
puts "Launching Scrappy Web Server..."
|
66
|
-
require 'scrappy/server'
|
65
|
+
puts "Launching Scrappy Web Server (browse http://localhost:#{Options.port})..."
|
66
|
+
require 'scrappy/server/server'
|
67
67
|
Thin::Logging.silent = true
|
68
|
-
Scrappy::Server.run! :host => 'localhost', :port => Options.port, :environment=>:production, :
|
68
|
+
Scrappy::Server.run! :host => 'localhost', :port => Options.port, :environment=>:production, :base_uri=>Options.base_uri
|
69
69
|
elsif Options.shell
|
70
70
|
puts "Launching Scrappy Shell..."
|
71
71
|
require 'scrappy/shell'
|
File without changes
|
Binary file
|
Binary file
|
@@ -0,0 +1,40 @@
|
|
1
|
+
body {
|
2
|
+
font-family: Arial, sans;
|
3
|
+
}
|
4
|
+
.center {
|
5
|
+
text-align: center;
|
6
|
+
margin-top: 100px;
|
7
|
+
}
|
8
|
+
.search {
|
9
|
+
margin-top: 40px;
|
10
|
+
margin-bottom: 100px;
|
11
|
+
font-size:20px;
|
12
|
+
}
|
13
|
+
.search input {
|
14
|
+
width: 400px; height:30px; font-size:16px;
|
15
|
+
}
|
16
|
+
.search select {
|
17
|
+
width: 80px; height: 30px; font-size:16px;
|
18
|
+
}
|
19
|
+
.search button {
|
20
|
+
width: 80px; height: 30px; font-size:16px;
|
21
|
+
}
|
22
|
+
pre {
|
23
|
+
width: 600px;
|
24
|
+
margin-left: auto;
|
25
|
+
margin-right: auto;
|
26
|
+
border: 1px solid;
|
27
|
+
padding:10px;
|
28
|
+
}
|
29
|
+
#header {
|
30
|
+
margin: auto; width: 800px; padding: 15px;
|
31
|
+
border-bottom: 1px solid;
|
32
|
+
margin-top: 20px; font-size: 14px; color: #555;
|
33
|
+
}
|
34
|
+
#body {
|
35
|
+
margin: auto; width: 800px; padding: 15px;
|
36
|
+
font-size: 14px; color: #555;
|
37
|
+
}
|
38
|
+
#footer {
|
39
|
+
margin-top:30px; text-align: center; font-size:14px; color: #555;
|
40
|
+
}
|
@@ -5,12 +5,22 @@ require 'haml'
|
|
5
5
|
module Scrappy
|
6
6
|
class Server < Sinatra::Base
|
7
7
|
enable :sessions
|
8
|
-
set :
|
8
|
+
set :root, File.dirname(__FILE__)
|
9
|
+
set :views, Proc.new { File.join(root, "views") }
|
10
|
+
set :public, Proc.new { File.join(root, "public") }
|
9
11
|
|
10
12
|
get '/' do
|
11
|
-
|
13
|
+
if params[:format] and params[:uri]
|
14
|
+
redirect "#{settings.base_uri}/#{params[:format]}/#{simplify_uri(params[:uri])}"
|
15
|
+
else
|
16
|
+
haml :home
|
17
|
+
end
|
12
18
|
end
|
13
|
-
|
19
|
+
|
20
|
+
get '/help' do
|
21
|
+
haml :help
|
22
|
+
end
|
23
|
+
|
14
24
|
get '/:format/*' do |format, url|
|
15
25
|
process_request :get, format, url, params[:callback]
|
16
26
|
end
|
@@ -24,7 +34,7 @@ module Scrappy
|
|
24
34
|
response = agent.proxy :method=>method, :uri=>url, :inputs=>inputs, :format=>format.to_sym
|
25
35
|
case response.status
|
26
36
|
when :redirect
|
27
|
-
redirect "#{settings.
|
37
|
+
redirect "#{settings.base_uri}/#{format}/#{simplify_uri(response.uri)}#{textual_inputs}"
|
28
38
|
when :ok
|
29
39
|
headers 'Content-Type' => response.content_type
|
30
40
|
callback ? "#{callback}(#{response.output})" : response.output
|
@@ -51,5 +61,9 @@ module Scrappy
|
|
51
61
|
return '' if inputs.merge('callback'=>params[:callback]).reject{|k,v| v.nil?}.empty?
|
52
62
|
"?" + (inputs.merge('callback'=>params[:callback]).reject{|k,v| v.nil?}.map{|k,v| "#{CGI.escape(k)}=#{CGI.escape(v)}"}*'')
|
53
63
|
end
|
64
|
+
|
65
|
+
def simplify_uri uri
|
66
|
+
CGI::escape(uri).gsub('%2F','/').gsub('%3A',':')
|
67
|
+
end
|
54
68
|
end
|
55
69
|
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
!!!
|
2
|
+
%html
|
3
|
+
%head
|
4
|
+
%title Help - Scrappy
|
5
|
+
%link{:type=>"text/css", :href=>"/stylesheets/application.css", :rel=>"stylesheet"}
|
6
|
+
%body
|
7
|
+
%div#header
|
8
|
+
%img{:src=>'/images/logo_small.png'}
|
9
|
+
%div#body
|
10
|
+
%h1 Help
|
11
|
+
%p
|
12
|
+
Scrappy web server is a web interface to Scrappy functionalities.
|
13
|
+
You can use it to get data from a web resource and integrate it with other system.
|
14
|
+
The service uses the following URL format:
|
15
|
+
%pre http://[host]/[format]/[url]
|
16
|
+
%p
|
17
|
+
For example, to retrieve http://example.com/~user/?test=1 with RDF serialization:
|
18
|
+
%pre http://localhost:3434/rdf/http://example.com/~user/%3Ftest%3D1
|
19
|
+
%p Available serialization formats are rdf, rdfxml, png, yarf, ntriples, turtle, json, jsonrdf, and ejson.
|
20
|
+
%div#footer
|
21
|
+
%a{:href=>"/"} Home
|
22
|
+
|
|
23
|
+
%a{:href=>"/help"} Help
|
24
|
+
|
|
25
|
+
%a{:href=>'http://github.com/josei/scrappy'} About
|
@@ -0,0 +1,25 @@
|
|
1
|
+
!!!
|
2
|
+
%html
|
3
|
+
%head
|
4
|
+
%title Scrappy
|
5
|
+
%link{:type=>"text/css", :href=>"/stylesheets/application.css", :rel=>"stylesheet"}
|
6
|
+
%body
|
7
|
+
%div.center
|
8
|
+
%a{:href=>'http://github.com/josei/scrappy'}
|
9
|
+
%img{:src=>'/images/logo.png'}
|
10
|
+
%form.search
|
11
|
+
%div
|
12
|
+
%select{:name=>:format}
|
13
|
+
%option{:value=>:rdf} RDF
|
14
|
+
%option{:value=>:png} PNG
|
15
|
+
%option{:value=>:ejson} JSON
|
16
|
+
%option{:value=>:yarf} YARF
|
17
|
+
%option{:value=>:ntriples} nTriples
|
18
|
+
%input{:name=>:uri, :size=>30, :type=>:text, :value=>'Enter URI...', :onclick=>"if(this.value=='Enter URI...') this.value='';"}
|
19
|
+
%button Scrape
|
20
|
+
%div#footer
|
21
|
+
%a{:href=>"/"} Home
|
22
|
+
|
|
23
|
+
%a{:href=>"/help"} Help
|
24
|
+
|
|
25
|
+
%a{:href=>'http://github.com/josei/scrappy'} About
|
data/lib/scrappy.rb
CHANGED
data/scrappy.gemspec
CHANGED
@@ -2,17 +2,17 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.1.
|
5
|
+
s.version = "0.1.23"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2011-
|
9
|
+
s.date = %q{2011-03-03}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
13
13
|
s.executables = ["scrappy"]
|
14
|
-
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/
|
15
|
-
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/
|
14
|
+
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/proxy.rb", "lib/scrappy/server/server.rb", "lib/scrappy/server/public/images/logo.png", "lib/scrappy/server/public/images/logo_small.png", "lib/scrappy/server/public/stylesheets/application.css", "lib/scrappy/server/views/home.haml", "lib/scrappy/server/views/help.haml", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
|
15
|
+
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/proxy.rb", "lib/scrappy/server/server.rb", "lib/scrappy/server/public/images/logo.png", "lib/scrappy/server/public/images/logo_small.png", "lib/scrappy/server/public/stylesheets/application.css", "lib/scrappy/server/views/home.haml", "lib/scrappy/server/views/help.haml", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
|
16
16
|
s.homepage = %q{http://github.com/josei/scrappy}
|
17
17
|
s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
|
18
18
|
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
-
|
9
|
-
version: 0.1.
|
8
|
+
- 23
|
9
|
+
version: 0.1.23
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-
|
17
|
+
date: 2011-03-03 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -148,7 +148,6 @@ extra_rdoc_files:
|
|
148
148
|
- lib/scrappy/agent/map_reduce.rb
|
149
149
|
- lib/scrappy/agent/extractor.rb
|
150
150
|
- lib/scrappy/agent/visual_agent.rb
|
151
|
-
- lib/scrappy/proxy.rb
|
152
151
|
- lib/scrappy/selectors/base_uri.rb
|
153
152
|
- lib/scrappy/selectors/css.rb
|
154
153
|
- lib/scrappy/selectors/new_uri.rb
|
@@ -158,10 +157,15 @@ extra_rdoc_files:
|
|
158
157
|
- lib/scrappy/selectors/uri.rb
|
159
158
|
- lib/scrappy/selectors/uri_pattern.rb
|
160
159
|
- lib/scrappy/selectors/xpath.rb
|
161
|
-
- lib/scrappy/server.rb
|
160
|
+
- lib/scrappy/server/proxy.rb
|
161
|
+
- lib/scrappy/server/server.rb
|
162
|
+
- lib/scrappy/server/public/images/logo.png
|
163
|
+
- lib/scrappy/server/public/images/logo_small.png
|
164
|
+
- lib/scrappy/server/public/stylesheets/application.css
|
165
|
+
- lib/scrappy/server/views/home.haml
|
166
|
+
- lib/scrappy/server/views/help.haml
|
162
167
|
- lib/scrappy/shell.rb
|
163
168
|
- lib/scrappy/support.rb
|
164
|
-
- lib/scrappy/views/home.haml
|
165
169
|
- lib/scrappy/webkit/webkit.rb
|
166
170
|
files:
|
167
171
|
- History.txt
|
@@ -180,7 +184,6 @@ files:
|
|
180
184
|
- lib/scrappy/agent/map_reduce.rb
|
181
185
|
- lib/scrappy/agent/extractor.rb
|
182
186
|
- lib/scrappy/agent/visual_agent.rb
|
183
|
-
- lib/scrappy/proxy.rb
|
184
187
|
- lib/scrappy/selectors/base_uri.rb
|
185
188
|
- lib/scrappy/selectors/css.rb
|
186
189
|
- lib/scrappy/selectors/new_uri.rb
|
@@ -190,10 +193,15 @@ files:
|
|
190
193
|
- lib/scrappy/selectors/uri.rb
|
191
194
|
- lib/scrappy/selectors/uri_pattern.rb
|
192
195
|
- lib/scrappy/selectors/xpath.rb
|
193
|
-
- lib/scrappy/server.rb
|
196
|
+
- lib/scrappy/server/proxy.rb
|
197
|
+
- lib/scrappy/server/server.rb
|
198
|
+
- lib/scrappy/server/public/images/logo.png
|
199
|
+
- lib/scrappy/server/public/images/logo_small.png
|
200
|
+
- lib/scrappy/server/public/stylesheets/application.css
|
201
|
+
- lib/scrappy/server/views/home.haml
|
202
|
+
- lib/scrappy/server/views/help.haml
|
194
203
|
- lib/scrappy/shell.rb
|
195
204
|
- lib/scrappy/support.rb
|
196
|
-
- lib/scrappy/views/home.haml
|
197
205
|
- lib/scrappy/webkit/webkit.rb
|
198
206
|
- test/test_helper.rb
|
199
207
|
- test/test_scrappy.rb
|
data/lib/scrappy/views/home.haml
DELETED
@@ -1,23 +0,0 @@
|
|
1
|
-
!!!
|
2
|
-
%html
|
3
|
-
%head
|
4
|
-
%title Scrappy Web Server
|
5
|
-
%body
|
6
|
-
%h1 Scrappy Web Server
|
7
|
-
%p Use following URL format: http://[host]/[format]/[url]
|
8
|
-
%p
|
9
|
-
For example:
|
10
|
-
%a{:href=>"http://localhost:3434/rdfxml/http://www.google.com"}
|
11
|
-
http://localhost:3434/rdfxml/http://www.google.com
|
12
|
-
|
13
|
-
Remember to escape parameters: http://www.example.com/~user/%3Ftest%3D1%26test1%3D2
|
14
|
-
%br
|
15
|
-
or
|
16
|
-
%br
|
17
|
-
http%3A%2F%2Fwww.example.com%2F~user%2F%3Ftest%3D1%26test1%3D2
|
18
|
-
%br
|
19
|
-
instead of
|
20
|
-
%br
|
21
|
-
http://www.example.com/~user/?test=1&test1=2"
|
22
|
-
|
23
|
-
%p Available formats are png, yarf, rdfxml, ntriples, turtle, json, jsonrdf, ejson
|