scrappy 0.1.22 → 0.1.23
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +4 -0
- data/Manifest +7 -3
- data/bin/scrappy +6 -6
- data/lib/scrappy/{proxy.rb → server/proxy.rb} +0 -0
- data/lib/scrappy/server/public/images/logo.png +0 -0
- data/lib/scrappy/server/public/images/logo_small.png +0 -0
- data/lib/scrappy/server/public/stylesheets/application.css +40 -0
- data/lib/scrappy/{server.rb → server/server.rb} +18 -4
- data/lib/scrappy/server/views/help.haml +25 -0
- data/lib/scrappy/server/views/home.haml +25 -0
- data/lib/scrappy.rb +1 -1
- data/scrappy.gemspec +4 -4
- metadata +17 -9
- data/lib/scrappy/views/home.haml +0 -23
data/History.txt
CHANGED
data/Manifest
CHANGED
@@ -14,7 +14,6 @@ lib/scrappy/agent/formats.rb
|
|
14
14
|
lib/scrappy/agent/map_reduce.rb
|
15
15
|
lib/scrappy/agent/extractor.rb
|
16
16
|
lib/scrappy/agent/visual_agent.rb
|
17
|
-
lib/scrappy/proxy.rb
|
18
17
|
lib/scrappy/selectors/base_uri.rb
|
19
18
|
lib/scrappy/selectors/css.rb
|
20
19
|
lib/scrappy/selectors/new_uri.rb
|
@@ -24,10 +23,15 @@ lib/scrappy/selectors/slice.rb
|
|
24
23
|
lib/scrappy/selectors/uri.rb
|
25
24
|
lib/scrappy/selectors/uri_pattern.rb
|
26
25
|
lib/scrappy/selectors/xpath.rb
|
27
|
-
lib/scrappy/server.rb
|
26
|
+
lib/scrappy/server/proxy.rb
|
27
|
+
lib/scrappy/server/server.rb
|
28
|
+
lib/scrappy/server/public/images/logo.png
|
29
|
+
lib/scrappy/server/public/images/logo_small.png
|
30
|
+
lib/scrappy/server/public/stylesheets/application.css
|
31
|
+
lib/scrappy/server/views/home.haml
|
32
|
+
lib/scrappy/server/views/help.haml
|
28
33
|
lib/scrappy/shell.rb
|
29
34
|
lib/scrappy/support.rb
|
30
|
-
lib/scrappy/views/home.haml
|
31
35
|
lib/scrappy/webkit/webkit.rb
|
32
36
|
test/test_helper.rb
|
33
37
|
test/test_scrappy.rb
|
data/bin/scrappy
CHANGED
@@ -36,7 +36,7 @@ module Scrappy
|
|
36
36
|
opts.on('-D', '--dump') { Agent::Options.dump = true; Agent::Options.format = :rdf }
|
37
37
|
opts.on('-u', '--debug') { Agent::Options.debug = true }
|
38
38
|
opts.on('-i', '--interactive') { Options.shell = true; Agent::Options.format_header = false }
|
39
|
-
opts.on('-s', '--server [
|
39
|
+
opts.on('-s', '--server [BASE_URI]') { |uri| Options.server = true; Options.base_uri = uri }
|
40
40
|
opts.on('-S', '--proxy-server') { Options.proxy = true }
|
41
41
|
opts.on('-P P', '--port P') { |p| Options.port = p }
|
42
42
|
opts.on('-c C', '--concurrence C') { |c| Agent::Options.workers = c.to_i }
|
@@ -57,15 +57,15 @@ module Scrappy
|
|
57
57
|
Options.quiet = true
|
58
58
|
puts Agent.create.proxy(:http_method=>:get, :uri=>Options.url).output
|
59
59
|
elsif Options.proxy
|
60
|
-
puts "Launching Scrappy Web Proxy..."
|
61
|
-
require 'scrappy/proxy'
|
60
|
+
puts "Launching Scrappy Web Proxy (set http://localhost:#{Options.port} as proxy)..."
|
61
|
+
require 'scrappy/server/proxy'
|
62
62
|
Thin::Logging.silent = true
|
63
63
|
Scrappy::Proxy.run! :host => 'localhost', :port => Options.port, :environment=>:production
|
64
64
|
elsif Options.server
|
65
|
-
puts "Launching Scrappy Web Server..."
|
66
|
-
require 'scrappy/server'
|
65
|
+
puts "Launching Scrappy Web Server (browse http://localhost:#{Options.port})..."
|
66
|
+
require 'scrappy/server/server'
|
67
67
|
Thin::Logging.silent = true
|
68
|
-
Scrappy::Server.run! :host => 'localhost', :port => Options.port, :environment=>:production, :
|
68
|
+
Scrappy::Server.run! :host => 'localhost', :port => Options.port, :environment=>:production, :base_uri=>Options.base_uri
|
69
69
|
elsif Options.shell
|
70
70
|
puts "Launching Scrappy Shell..."
|
71
71
|
require 'scrappy/shell'
|
File without changes
|
Binary file
|
Binary file
|
@@ -0,0 +1,40 @@
|
|
1
|
+
body {
|
2
|
+
font-family: Arial, sans;
|
3
|
+
}
|
4
|
+
.center {
|
5
|
+
text-align: center;
|
6
|
+
margin-top: 100px;
|
7
|
+
}
|
8
|
+
.search {
|
9
|
+
margin-top: 40px;
|
10
|
+
margin-bottom: 100px;
|
11
|
+
font-size:20px;
|
12
|
+
}
|
13
|
+
.search input {
|
14
|
+
width: 400px; height:30px; font-size:16px;
|
15
|
+
}
|
16
|
+
.search select {
|
17
|
+
width: 80px; height: 30px; font-size:16px;
|
18
|
+
}
|
19
|
+
.search button {
|
20
|
+
width: 80px; height: 30px; font-size:16px;
|
21
|
+
}
|
22
|
+
pre {
|
23
|
+
width: 600px;
|
24
|
+
margin-left: auto;
|
25
|
+
margin-right: auto;
|
26
|
+
border: 1px solid;
|
27
|
+
padding:10px;
|
28
|
+
}
|
29
|
+
#header {
|
30
|
+
margin: auto; width: 800px; padding: 15px;
|
31
|
+
border-bottom: 1px solid;
|
32
|
+
margin-top: 20px; font-size: 14px; color: #555;
|
33
|
+
}
|
34
|
+
#body {
|
35
|
+
margin: auto; width: 800px; padding: 15px;
|
36
|
+
font-size: 14px; color: #555;
|
37
|
+
}
|
38
|
+
#footer {
|
39
|
+
margin-top:30px; text-align: center; font-size:14px; color: #555;
|
40
|
+
}
|
@@ -5,12 +5,22 @@ require 'haml'
|
|
5
5
|
module Scrappy
|
6
6
|
class Server < Sinatra::Base
|
7
7
|
enable :sessions
|
8
|
-
set :
|
8
|
+
set :root, File.dirname(__FILE__)
|
9
|
+
set :views, Proc.new { File.join(root, "views") }
|
10
|
+
set :public, Proc.new { File.join(root, "public") }
|
9
11
|
|
10
12
|
get '/' do
|
11
|
-
|
13
|
+
if params[:format] and params[:uri]
|
14
|
+
redirect "#{settings.base_uri}/#{params[:format]}/#{simplify_uri(params[:uri])}"
|
15
|
+
else
|
16
|
+
haml :home
|
17
|
+
end
|
12
18
|
end
|
13
|
-
|
19
|
+
|
20
|
+
get '/help' do
|
21
|
+
haml :help
|
22
|
+
end
|
23
|
+
|
14
24
|
get '/:format/*' do |format, url|
|
15
25
|
process_request :get, format, url, params[:callback]
|
16
26
|
end
|
@@ -24,7 +34,7 @@ module Scrappy
|
|
24
34
|
response = agent.proxy :method=>method, :uri=>url, :inputs=>inputs, :format=>format.to_sym
|
25
35
|
case response.status
|
26
36
|
when :redirect
|
27
|
-
redirect "#{settings.
|
37
|
+
redirect "#{settings.base_uri}/#{format}/#{simplify_uri(response.uri)}#{textual_inputs}"
|
28
38
|
when :ok
|
29
39
|
headers 'Content-Type' => response.content_type
|
30
40
|
callback ? "#{callback}(#{response.output})" : response.output
|
@@ -51,5 +61,9 @@ module Scrappy
|
|
51
61
|
return '' if inputs.merge('callback'=>params[:callback]).reject{|k,v| v.nil?}.empty?
|
52
62
|
"?" + (inputs.merge('callback'=>params[:callback]).reject{|k,v| v.nil?}.map{|k,v| "#{CGI.escape(k)}=#{CGI.escape(v)}"}*'')
|
53
63
|
end
|
64
|
+
|
65
|
+
def simplify_uri uri
|
66
|
+
CGI::escape(uri).gsub('%2F','/').gsub('%3A',':')
|
67
|
+
end
|
54
68
|
end
|
55
69
|
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
!!!
|
2
|
+
%html
|
3
|
+
%head
|
4
|
+
%title Help - Scrappy
|
5
|
+
%link{:type=>"text/css", :href=>"/stylesheets/application.css", :rel=>"stylesheet"}
|
6
|
+
%body
|
7
|
+
%div#header
|
8
|
+
%img{:src=>'/images/logo_small.png'}
|
9
|
+
%div#body
|
10
|
+
%h1 Help
|
11
|
+
%p
|
12
|
+
Scrappy web server is a web interface to Scrappy functionalities.
|
13
|
+
You can use it to get data from a web resource and integrate it with other system.
|
14
|
+
The service uses the following URL format:
|
15
|
+
%pre http://[host]/[format]/[url]
|
16
|
+
%p
|
17
|
+
For example, to retrieve http://example.com/~user/?test=1 with RDF serialization:
|
18
|
+
%pre http://localhost:3434/rdf/http://example.com/~user/%3Ftest%3D1
|
19
|
+
%p Available serialization formats are rdf, rdfxml, png, yarf, ntriples, turtle, json, jsonrdf, and ejson.
|
20
|
+
%div#footer
|
21
|
+
%a{:href=>"/"} Home
|
22
|
+
|
|
23
|
+
%a{:href=>"/help"} Help
|
24
|
+
|
|
25
|
+
%a{:href=>'http://github.com/josei/scrappy'} About
|
@@ -0,0 +1,25 @@
|
|
1
|
+
!!!
|
2
|
+
%html
|
3
|
+
%head
|
4
|
+
%title Scrappy
|
5
|
+
%link{:type=>"text/css", :href=>"/stylesheets/application.css", :rel=>"stylesheet"}
|
6
|
+
%body
|
7
|
+
%div.center
|
8
|
+
%a{:href=>'http://github.com/josei/scrappy'}
|
9
|
+
%img{:src=>'/images/logo.png'}
|
10
|
+
%form.search
|
11
|
+
%div
|
12
|
+
%select{:name=>:format}
|
13
|
+
%option{:value=>:rdf} RDF
|
14
|
+
%option{:value=>:png} PNG
|
15
|
+
%option{:value=>:ejson} JSON
|
16
|
+
%option{:value=>:yarf} YARF
|
17
|
+
%option{:value=>:ntriples} nTriples
|
18
|
+
%input{:name=>:uri, :size=>30, :type=>:text, :value=>'Enter URI...', :onclick=>"if(this.value=='Enter URI...') this.value='';"}
|
19
|
+
%button Scrape
|
20
|
+
%div#footer
|
21
|
+
%a{:href=>"/"} Home
|
22
|
+
|
|
23
|
+
%a{:href=>"/help"} Help
|
24
|
+
|
|
25
|
+
%a{:href=>'http://github.com/josei/scrappy'} About
|
data/lib/scrappy.rb
CHANGED
data/scrappy.gemspec
CHANGED
@@ -2,17 +2,17 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.1.22"
|
5
|
+
s.version = "0.1.23"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2011-
|
9
|
+
s.date = %q{2011-03-03}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
13
13
|
s.executables = ["scrappy"]
|
14
|
-
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/
|
15
|
-
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/
|
14
|
+
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/proxy.rb", "lib/scrappy/server/server.rb", "lib/scrappy/server/public/images/logo.png", "lib/scrappy/server/public/images/logo_small.png", "lib/scrappy/server/public/stylesheets/application.css", "lib/scrappy/server/views/home.haml", "lib/scrappy/server/views/help.haml", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
|
15
|
+
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/proxy.rb", "lib/scrappy/server/server.rb", "lib/scrappy/server/public/images/logo.png", "lib/scrappy/server/public/images/logo_small.png", "lib/scrappy/server/public/stylesheets/application.css", "lib/scrappy/server/views/home.haml", "lib/scrappy/server/views/help.haml", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
|
16
16
|
s.homepage = %q{http://github.com/josei/scrappy}
|
17
17
|
s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
|
18
18
|
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 22
|
9
|
-
version: 0.1.22
|
8
|
+
- 23
|
9
|
+
version: 0.1.23
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-
|
17
|
+
date: 2011-03-03 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -148,7 +148,6 @@ extra_rdoc_files:
|
|
148
148
|
- lib/scrappy/agent/map_reduce.rb
|
149
149
|
- lib/scrappy/agent/extractor.rb
|
150
150
|
- lib/scrappy/agent/visual_agent.rb
|
151
|
-
- lib/scrappy/proxy.rb
|
152
151
|
- lib/scrappy/selectors/base_uri.rb
|
153
152
|
- lib/scrappy/selectors/css.rb
|
154
153
|
- lib/scrappy/selectors/new_uri.rb
|
@@ -158,10 +157,15 @@ extra_rdoc_files:
|
|
158
157
|
- lib/scrappy/selectors/uri.rb
|
159
158
|
- lib/scrappy/selectors/uri_pattern.rb
|
160
159
|
- lib/scrappy/selectors/xpath.rb
|
161
|
-
- lib/scrappy/server.rb
|
160
|
+
- lib/scrappy/server/proxy.rb
|
161
|
+
- lib/scrappy/server/server.rb
|
162
|
+
- lib/scrappy/server/public/images/logo.png
|
163
|
+
- lib/scrappy/server/public/images/logo_small.png
|
164
|
+
- lib/scrappy/server/public/stylesheets/application.css
|
165
|
+
- lib/scrappy/server/views/home.haml
|
166
|
+
- lib/scrappy/server/views/help.haml
|
162
167
|
- lib/scrappy/shell.rb
|
163
168
|
- lib/scrappy/support.rb
|
164
|
-
- lib/scrappy/views/home.haml
|
165
169
|
- lib/scrappy/webkit/webkit.rb
|
166
170
|
files:
|
167
171
|
- History.txt
|
@@ -180,7 +184,6 @@ files:
|
|
180
184
|
- lib/scrappy/agent/map_reduce.rb
|
181
185
|
- lib/scrappy/agent/extractor.rb
|
182
186
|
- lib/scrappy/agent/visual_agent.rb
|
183
|
-
- lib/scrappy/proxy.rb
|
184
187
|
- lib/scrappy/selectors/base_uri.rb
|
185
188
|
- lib/scrappy/selectors/css.rb
|
186
189
|
- lib/scrappy/selectors/new_uri.rb
|
@@ -190,10 +193,15 @@ files:
|
|
190
193
|
- lib/scrappy/selectors/uri.rb
|
191
194
|
- lib/scrappy/selectors/uri_pattern.rb
|
192
195
|
- lib/scrappy/selectors/xpath.rb
|
193
|
-
- lib/scrappy/server.rb
|
196
|
+
- lib/scrappy/server/proxy.rb
|
197
|
+
- lib/scrappy/server/server.rb
|
198
|
+
- lib/scrappy/server/public/images/logo.png
|
199
|
+
- lib/scrappy/server/public/images/logo_small.png
|
200
|
+
- lib/scrappy/server/public/stylesheets/application.css
|
201
|
+
- lib/scrappy/server/views/home.haml
|
202
|
+
- lib/scrappy/server/views/help.haml
|
194
203
|
- lib/scrappy/shell.rb
|
195
204
|
- lib/scrappy/support.rb
|
196
|
-
- lib/scrappy/views/home.haml
|
197
205
|
- lib/scrappy/webkit/webkit.rb
|
198
206
|
- test/test_helper.rb
|
199
207
|
- test/test_scrappy.rb
|
data/lib/scrappy/views/home.haml
DELETED
@@ -1,23 +0,0 @@
|
|
1
|
-
!!!
|
2
|
-
%html
|
3
|
-
%head
|
4
|
-
%title Scrappy Web Server
|
5
|
-
%body
|
6
|
-
%h1 Scrappy Web Server
|
7
|
-
%p Use following URL format: http://[host]/[format]/[url]
|
8
|
-
%p
|
9
|
-
For example:
|
10
|
-
%a{:href=>"http://localhost:3434/rdfxml/http://www.google.com"}
|
11
|
-
http://localhost:3434/rdfxml/http://www.google.com
|
12
|
-
|
13
|
-
Remember to escape parameters: http://www.example.com/~user/%3Ftest%3D1%26test1%3D2
|
14
|
-
%br
|
15
|
-
or
|
16
|
-
%br
|
17
|
-
http%3A%2F%2Fwww.example.com%2F~user%2F%3Ftest%3D1%26test1%3D2
|
18
|
-
%br
|
19
|
-
instead of
|
20
|
-
%br
|
21
|
-
http://www.example.com/~user/?test=1&test1=2"
|
22
|
-
|
23
|
-
%p Available formats are png, yarf, rdfxml, ntriples, turtle, json, jsonrdf, ejson
|