scrappy 0.1.22 → 0.1.23

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,3 +1,7 @@
1
+ === 0.1.23 2011-03-03
2
+
3
+ * Prettier web interface
4
+
1
5
  === 0.1.22 2011-02-25
2
6
 
3
7
  * Support for reverse proxy in web server
data/Manifest CHANGED
@@ -14,7 +14,6 @@ lib/scrappy/agent/formats.rb
14
14
  lib/scrappy/agent/map_reduce.rb
15
15
  lib/scrappy/agent/extractor.rb
16
16
  lib/scrappy/agent/visual_agent.rb
17
- lib/scrappy/proxy.rb
18
17
  lib/scrappy/selectors/base_uri.rb
19
18
  lib/scrappy/selectors/css.rb
20
19
  lib/scrappy/selectors/new_uri.rb
@@ -24,10 +23,15 @@ lib/scrappy/selectors/slice.rb
24
23
  lib/scrappy/selectors/uri.rb
25
24
  lib/scrappy/selectors/uri_pattern.rb
26
25
  lib/scrappy/selectors/xpath.rb
27
- lib/scrappy/server.rb
26
+ lib/scrappy/server/proxy.rb
27
+ lib/scrappy/server/server.rb
28
+ lib/scrappy/server/public/images/logo.png
29
+ lib/scrappy/server/public/images/logo_small.png
30
+ lib/scrappy/server/public/stylesheets/application.css
31
+ lib/scrappy/server/views/home.haml
32
+ lib/scrappy/server/views/help.haml
28
33
  lib/scrappy/shell.rb
29
34
  lib/scrappy/support.rb
30
- lib/scrappy/views/home.haml
31
35
  lib/scrappy/webkit/webkit.rb
32
36
  test/test_helper.rb
33
37
  test/test_scrappy.rb
data/bin/scrappy CHANGED
@@ -36,7 +36,7 @@ module Scrappy
36
36
  opts.on('-D', '--dump') { Agent::Options.dump = true; Agent::Options.format = :rdf }
37
37
  opts.on('-u', '--debug') { Agent::Options.debug = true }
38
38
  opts.on('-i', '--interactive') { Options.shell = true; Agent::Options.format_header = false }
39
- opts.on('-s', '--server [ROOT]') { |url| Options.server = true; Options.root = url }
39
+ opts.on('-s', '--server [BASE_URI]') { |uri| Options.server = true; Options.base_uri = uri }
40
40
  opts.on('-S', '--proxy-server') { Options.proxy = true }
41
41
  opts.on('-P P', '--port P') { |p| Options.port = p }
42
42
  opts.on('-c C', '--concurrence C') { |c| Agent::Options.workers = c.to_i }
@@ -57,15 +57,15 @@ module Scrappy
57
57
  Options.quiet = true
58
58
  puts Agent.create.proxy(:http_method=>:get, :uri=>Options.url).output
59
59
  elsif Options.proxy
60
- puts "Launching Scrappy Web Proxy..."
61
- require 'scrappy/proxy'
60
+ puts "Launching Scrappy Web Proxy (set http://localhost:#{Options.port} as proxy)..."
61
+ require 'scrappy/server/proxy'
62
62
  Thin::Logging.silent = true
63
63
  Scrappy::Proxy.run! :host => 'localhost', :port => Options.port, :environment=>:production
64
64
  elsif Options.server
65
- puts "Launching Scrappy Web Server..."
66
- require 'scrappy/server'
65
+ puts "Launching Scrappy Web Server (browse http://localhost:#{Options.port})..."
66
+ require 'scrappy/server/server'
67
67
  Thin::Logging.silent = true
68
- Scrappy::Server.run! :host => 'localhost', :port => Options.port, :environment=>:production, :root=>Options.root
68
+ Scrappy::Server.run! :host => 'localhost', :port => Options.port, :environment=>:production, :base_uri=>Options.base_uri
69
69
  elsif Options.shell
70
70
  puts "Launching Scrappy Shell..."
71
71
  require 'scrappy/shell'
File without changes
@@ -0,0 +1,40 @@
1
+ body {
2
+ font-family: Arial, sans;
3
+ }
4
+ .center {
5
+ text-align: center;
6
+ margin-top: 100px;
7
+ }
8
+ .search {
9
+ margin-top: 40px;
10
+ margin-bottom: 100px;
11
+ font-size:20px;
12
+ }
13
+ .search input {
14
+ width: 400px; height:30px; font-size:16px;
15
+ }
16
+ .search select {
17
+ width: 80px; height: 30px; font-size:16px;
18
+ }
19
+ .search button {
20
+ width: 80px; height: 30px; font-size:16px;
21
+ }
22
+ pre {
23
+ width: 600px;
24
+ margin-left: auto;
25
+ margin-right: auto;
26
+ border: 1px solid;
27
+ padding:10px;
28
+ }
29
+ #header {
30
+ margin: auto; width: 800px; padding: 15px;
31
+ border-bottom: 1px solid;
32
+ margin-top: 20px; font-size: 14px; color: #555;
33
+ }
34
+ #body {
35
+ margin: auto; width: 800px; padding: 15px;
36
+ font-size: 14px; color: #555;
37
+ }
38
+ #footer {
39
+ margin-top:30px; text-align: center; font-size:14px; color: #555;
40
+ }
@@ -5,12 +5,22 @@ require 'haml'
5
5
  module Scrappy
6
6
  class Server < Sinatra::Base
7
7
  enable :sessions
8
- set :views, File.dirname(__FILE__) + '/views'
8
+ set :root, File.dirname(__FILE__)
9
+ set :views, Proc.new { File.join(root, "views") }
10
+ set :public, Proc.new { File.join(root, "public") }
9
11
 
10
12
  get '/' do
11
- haml :home
13
+ if params[:format] and params[:uri]
14
+ redirect "#{settings.base_uri}/#{params[:format]}/#{simplify_uri(params[:uri])}"
15
+ else
16
+ haml :home
17
+ end
12
18
  end
13
-
19
+
20
+ get '/help' do
21
+ haml :help
22
+ end
23
+
14
24
  get '/:format/*' do |format, url|
15
25
  process_request :get, format, url, params[:callback]
16
26
  end
@@ -24,7 +34,7 @@ module Scrappy
24
34
  response = agent.proxy :method=>method, :uri=>url, :inputs=>inputs, :format=>format.to_sym
25
35
  case response.status
26
36
  when :redirect
27
- redirect "#{settings.root}/#{format}/#{CGI::escape(response.uri).gsub('%2F','/').gsub('%3A',':')}#{textual_inputs}"
37
+ redirect "#{settings.base_uri}/#{format}/#{simplify_uri(response.uri)}#{textual_inputs}"
28
38
  when :ok
29
39
  headers 'Content-Type' => response.content_type
30
40
  callback ? "#{callback}(#{response.output})" : response.output
@@ -51,5 +61,9 @@ module Scrappy
51
61
  return '' if inputs.merge('callback'=>params[:callback]).reject{|k,v| v.nil?}.empty?
52
62
  "?" + (inputs.merge('callback'=>params[:callback]).reject{|k,v| v.nil?}.map{|k,v| "#{CGI.escape(k)}=#{CGI.escape(v)}"}*'')
53
63
  end
64
+
65
+ def simplify_uri uri
66
+ CGI::escape(uri).gsub('%2F','/').gsub('%3A',':')
67
+ end
54
68
  end
55
69
  end
@@ -0,0 +1,25 @@
1
+ !!!
2
+ %html
3
+ %head
4
+ %title Help - Scrappy
5
+ %link{:type=>"text/css", :href=>"/stylesheets/application.css", :rel=>"stylesheet"}
6
+ %body
7
+ %div#header
8
+ %img{:src=>'/images/logo_small.png'}
9
+ %div#body
10
+ %h1 Help
11
+ %p
12
+ Scrappy web server is a web interface to Scrappy functionalities.
13
+ You can use it to get data from a web resource and integrate it with other systems.
14
+ The service uses the following URL format:
15
+ %pre http://[host]/[format]/[url]
16
+ %p
17
+ For example, to retrieve http://example.com/~user/?test=1 with RDF serialization:
18
+ %pre http://localhost:3434/rdf/http://example.com/~user/%3Ftest%3D1
19
+ %p Available serialization formats are rdf, rdfxml, png, yarf, ntriples, turtle, json, jsonrdf, and ejson.
20
+ %div#footer
21
+ %a{:href=>"/"} Home
22
+ |
23
+ %a{:href=>"/help"} Help
24
+ |
25
+ %a{:href=>'http://github.com/josei/scrappy'} About
@@ -0,0 +1,25 @@
1
+ !!!
2
+ %html
3
+ %head
4
+ %title Scrappy
5
+ %link{:type=>"text/css", :href=>"/stylesheets/application.css", :rel=>"stylesheet"}
6
+ %body
7
+ %div.center
8
+ %a{:href=>'http://github.com/josei/scrappy'}
9
+ %img{:src=>'/images/logo.png'}
10
+ %form.search
11
+ %div
12
+ %select{:name=>:format}
13
+ %option{:value=>:rdf} RDF
14
+ %option{:value=>:png} PNG
15
+ %option{:value=>:ejson} JSON
16
+ %option{:value=>:yarf} YARF
17
+ %option{:value=>:ntriples} nTriples
18
+ %input{:name=>:uri, :size=>30, :type=>:text, :value=>'Enter URI...', :onclick=>"if(this.value=='Enter URI...') this.value='';"}
19
+ %button Scrape
20
+ %div#footer
21
+ %a{:href=>"/"} Home
22
+ |
23
+ %a{:href=>"/help"} Help
24
+ |
25
+ %a{:href=>'http://github.com/josei/scrappy'} About
data/lib/scrappy.rb CHANGED
@@ -21,7 +21,7 @@ require 'scrappy/agent/agent'
21
21
  Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
22
22
 
23
23
  module Scrappy
24
- VERSION = '0.1.22'
24
+ VERSION = '0.1.23'
25
25
  end
26
26
 
27
27
  # Require selectors
data/scrappy.gemspec CHANGED
@@ -2,17 +2,17 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{scrappy}
5
- s.version = "0.1.22"
5
+ s.version = "0.1.23"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Jose Ignacio"]
9
- s.date = %q{2011-02-25}
9
+ s.date = %q{2011-03-03}
10
10
  s.default_executable = %q{scrappy}
11
11
  s.description = %q{RDF web scraper}
12
12
  s.email = %q{joseignacio.fernandez@gmail.com}
13
13
  s.executables = ["scrappy"]
14
- s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/views/home.haml", "lib/scrappy/webkit/webkit.rb"]
15
- s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/views/home.haml", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
14
+ s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/proxy.rb", "lib/scrappy/server/server.rb", "lib/scrappy/server/public/images/logo.png", "lib/scrappy/server/public/images/logo_small.png", "lib/scrappy/server/public/stylesheets/application.css", "lib/scrappy/server/views/home.haml", "lib/scrappy/server/views/help.haml", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
15
+ s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/proxy.rb", "lib/scrappy/server/server.rb", "lib/scrappy/server/public/images/logo.png", "lib/scrappy/server/public/images/logo_small.png", "lib/scrappy/server/public/stylesheets/application.css", "lib/scrappy/server/views/home.haml", "lib/scrappy/server/views/help.haml", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
16
16
  s.homepage = %q{http://github.com/josei/scrappy}
17
17
  s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
18
18
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 22
9
- version: 0.1.22
8
+ - 23
9
+ version: 0.1.23
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-02-25 00:00:00 +01:00
17
+ date: 2011-03-03 00:00:00 +01:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -148,7 +148,6 @@ extra_rdoc_files:
148
148
  - lib/scrappy/agent/map_reduce.rb
149
149
  - lib/scrappy/agent/extractor.rb
150
150
  - lib/scrappy/agent/visual_agent.rb
151
- - lib/scrappy/proxy.rb
152
151
  - lib/scrappy/selectors/base_uri.rb
153
152
  - lib/scrappy/selectors/css.rb
154
153
  - lib/scrappy/selectors/new_uri.rb
@@ -158,10 +157,15 @@ extra_rdoc_files:
158
157
  - lib/scrappy/selectors/uri.rb
159
158
  - lib/scrappy/selectors/uri_pattern.rb
160
159
  - lib/scrappy/selectors/xpath.rb
161
- - lib/scrappy/server.rb
160
+ - lib/scrappy/server/proxy.rb
161
+ - lib/scrappy/server/server.rb
162
+ - lib/scrappy/server/public/images/logo.png
163
+ - lib/scrappy/server/public/images/logo_small.png
164
+ - lib/scrappy/server/public/stylesheets/application.css
165
+ - lib/scrappy/server/views/home.haml
166
+ - lib/scrappy/server/views/help.haml
162
167
  - lib/scrappy/shell.rb
163
168
  - lib/scrappy/support.rb
164
- - lib/scrappy/views/home.haml
165
169
  - lib/scrappy/webkit/webkit.rb
166
170
  files:
167
171
  - History.txt
@@ -180,7 +184,6 @@ files:
180
184
  - lib/scrappy/agent/map_reduce.rb
181
185
  - lib/scrappy/agent/extractor.rb
182
186
  - lib/scrappy/agent/visual_agent.rb
183
- - lib/scrappy/proxy.rb
184
187
  - lib/scrappy/selectors/base_uri.rb
185
188
  - lib/scrappy/selectors/css.rb
186
189
  - lib/scrappy/selectors/new_uri.rb
@@ -190,10 +193,15 @@ files:
190
193
  - lib/scrappy/selectors/uri.rb
191
194
  - lib/scrappy/selectors/uri_pattern.rb
192
195
  - lib/scrappy/selectors/xpath.rb
193
- - lib/scrappy/server.rb
196
+ - lib/scrappy/server/proxy.rb
197
+ - lib/scrappy/server/server.rb
198
+ - lib/scrappy/server/public/images/logo.png
199
+ - lib/scrappy/server/public/images/logo_small.png
200
+ - lib/scrappy/server/public/stylesheets/application.css
201
+ - lib/scrappy/server/views/home.haml
202
+ - lib/scrappy/server/views/help.haml
194
203
  - lib/scrappy/shell.rb
195
204
  - lib/scrappy/support.rb
196
- - lib/scrappy/views/home.haml
197
205
  - lib/scrappy/webkit/webkit.rb
198
206
  - test/test_helper.rb
199
207
  - test/test_scrappy.rb
@@ -1,23 +0,0 @@
1
- !!!
2
- %html
3
- %head
4
- %title Scrappy Web Server
5
- %body
6
- %h1 Scrappy Web Server
7
- %p Use following URL format: http://[host]/[format]/[url]
8
- %p
9
- For example:
10
- %a{:href=>"http://localhost:3434/rdfxml/http://www.google.com"}
11
- http://localhost:3434/rdfxml/http://www.google.com
12
-
13
- Remember to escape parameters: http://www.example.com/~user/%3Ftest%3D1%26test1%3D2
14
- %br
15
- or
16
- %br
17
- http%3A%2F%2Fwww.example.com%2F~user%2F%3Ftest%3D1%26test1%3D2
18
- %br
19
- instead of
20
- %br
21
- http://www.example.com/~user/?test=1&test1=2"
22
-
23
- %p Available formats are png, yarf, rdfxml, ntriples, turtle, json, jsonrdf, ejson