scrappy 0.1.22 → 0.1.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,7 @@
1
+ === 0.1.23 2011-03-03
2
+
3
+ * Prettier web interface
4
+
1
5
  === 0.1.22 2011-02-25
2
6
 
3
7
  * Support for reverse proxy in web server
data/Manifest CHANGED
@@ -14,7 +14,6 @@ lib/scrappy/agent/formats.rb
14
14
  lib/scrappy/agent/map_reduce.rb
15
15
  lib/scrappy/agent/extractor.rb
16
16
  lib/scrappy/agent/visual_agent.rb
17
- lib/scrappy/proxy.rb
18
17
  lib/scrappy/selectors/base_uri.rb
19
18
  lib/scrappy/selectors/css.rb
20
19
  lib/scrappy/selectors/new_uri.rb
@@ -24,10 +23,15 @@ lib/scrappy/selectors/slice.rb
24
23
  lib/scrappy/selectors/uri.rb
25
24
  lib/scrappy/selectors/uri_pattern.rb
26
25
  lib/scrappy/selectors/xpath.rb
27
- lib/scrappy/server.rb
26
+ lib/scrappy/server/proxy.rb
27
+ lib/scrappy/server/server.rb
28
+ lib/scrappy/server/public/images/logo.png
29
+ lib/scrappy/server/public/images/logo_small.png
30
+ lib/scrappy/server/public/stylesheets/application.css
31
+ lib/scrappy/server/views/home.haml
32
+ lib/scrappy/server/views/help.haml
28
33
  lib/scrappy/shell.rb
29
34
  lib/scrappy/support.rb
30
- lib/scrappy/views/home.haml
31
35
  lib/scrappy/webkit/webkit.rb
32
36
  test/test_helper.rb
33
37
  test/test_scrappy.rb
data/bin/scrappy CHANGED
@@ -36,7 +36,7 @@ module Scrappy
36
36
  opts.on('-D', '--dump') { Agent::Options.dump = true; Agent::Options.format = :rdf }
37
37
  opts.on('-u', '--debug') { Agent::Options.debug = true }
38
38
  opts.on('-i', '--interactive') { Options.shell = true; Agent::Options.format_header = false }
39
- opts.on('-s', '--server [ROOT]') { |url| Options.server = true; Options.root = url }
39
+ opts.on('-s', '--server [BASE_URI]') { |uri| Options.server = true; Options.base_uri = uri }
40
40
  opts.on('-S', '--proxy-server') { Options.proxy = true }
41
41
  opts.on('-P P', '--port P') { |p| Options.port = p }
42
42
  opts.on('-c C', '--concurrence C') { |c| Agent::Options.workers = c.to_i }
@@ -57,15 +57,15 @@ module Scrappy
57
57
  Options.quiet = true
58
58
  puts Agent.create.proxy(:http_method=>:get, :uri=>Options.url).output
59
59
  elsif Options.proxy
60
- puts "Launching Scrappy Web Proxy..."
61
- require 'scrappy/proxy'
60
+ puts "Launching Scrappy Web Proxy (set http://localhost:#{Options.port} as proxy)..."
61
+ require 'scrappy/server/proxy'
62
62
  Thin::Logging.silent = true
63
63
  Scrappy::Proxy.run! :host => 'localhost', :port => Options.port, :environment=>:production
64
64
  elsif Options.server
65
- puts "Launching Scrappy Web Server..."
66
- require 'scrappy/server'
65
+ puts "Launching Scrappy Web Server (browse http://localhost:#{Options.port})..."
66
+ require 'scrappy/server/server'
67
67
  Thin::Logging.silent = true
68
- Scrappy::Server.run! :host => 'localhost', :port => Options.port, :environment=>:production, :root=>Options.root
68
+ Scrappy::Server.run! :host => 'localhost', :port => Options.port, :environment=>:production, :base_uri=>Options.base_uri
69
69
  elsif Options.shell
70
70
  puts "Launching Scrappy Shell..."
71
71
  require 'scrappy/shell'
File without changes
@@ -0,0 +1,40 @@
1
+ body {
2
+ font-family: Arial, sans;
3
+ }
4
+ .center {
5
+ text-align: center;
6
+ margin-top: 100px;
7
+ }
8
+ .search {
9
+ margin-top: 40px;
10
+ margin-bottom: 100px;
11
+ font-size:20px;
12
+ }
13
+ .search input {
14
+ width: 400px; height:30px; font-size:16px;
15
+ }
16
+ .search select {
17
+ width: 80px; height: 30px; font-size:16px;
18
+ }
19
+ .search button {
20
+ width: 80px; height: 30px; font-size:16px;
21
+ }
22
+ pre {
23
+ width: 600px;
24
+ margin-left: auto;
25
+ margin-right: auto;
26
+ border: 1px solid;
27
+ padding:10px;
28
+ }
29
+ #header {
30
+ margin: auto; width: 800px; padding: 15px;
31
+ border-bottom: 1px solid;
32
+ margin-top: 20px; font-size: 14px; color: #555;
33
+ }
34
+ #body {
35
+ margin: auto; width: 800px; padding: 15px;
36
+ font-size: 14px; color: #555;
37
+ }
38
+ #footer {
39
+ margin-top:30px; text-align: center; font-size:14px; color: #555;
40
+ }
@@ -5,12 +5,22 @@ require 'haml'
5
5
  module Scrappy
6
6
  class Server < Sinatra::Base
7
7
  enable :sessions
8
- set :views, File.dirname(__FILE__) + '/views'
8
+ set :root, File.dirname(__FILE__)
9
+ set :views, Proc.new { File.join(root, "views") }
10
+ set :public, Proc.new { File.join(root, "public") }
9
11
 
10
12
  get '/' do
11
- haml :home
13
+ if params[:format] and params[:uri]
14
+ redirect "#{settings.base_uri}/#{params[:format]}/#{simplify_uri(params[:uri])}"
15
+ else
16
+ haml :home
17
+ end
12
18
  end
13
-
19
+
20
+ get '/help' do
21
+ haml :help
22
+ end
23
+
14
24
  get '/:format/*' do |format, url|
15
25
  process_request :get, format, url, params[:callback]
16
26
  end
@@ -24,7 +34,7 @@ module Scrappy
24
34
  response = agent.proxy :method=>method, :uri=>url, :inputs=>inputs, :format=>format.to_sym
25
35
  case response.status
26
36
  when :redirect
27
- redirect "#{settings.root}/#{format}/#{CGI::escape(response.uri).gsub('%2F','/').gsub('%3A',':')}#{textual_inputs}"
37
+ redirect "#{settings.base_uri}/#{format}/#{simplify_uri(response.uri)}#{textual_inputs}"
28
38
  when :ok
29
39
  headers 'Content-Type' => response.content_type
30
40
  callback ? "#{callback}(#{response.output})" : response.output
@@ -51,5 +61,9 @@ module Scrappy
51
61
  return '' if inputs.merge('callback'=>params[:callback]).reject{|k,v| v.nil?}.empty?
52
62
  "?" + (inputs.merge('callback'=>params[:callback]).reject{|k,v| v.nil?}.map{|k,v| "#{CGI.escape(k)}=#{CGI.escape(v)}"}*'')
53
63
  end
64
+
65
+ def simplify_uri uri
66
+ CGI::escape(uri).gsub('%2F','/').gsub('%3A',':')
67
+ end
54
68
  end
55
69
  end
@@ -0,0 +1,25 @@
1
+ !!!
2
+ %html
3
+ %head
4
+ %title Help - Scrappy
5
+ %link{:type=>"text/css", :href=>"/stylesheets/application.css", :rel=>"stylesheet"}
6
+ %body
7
+ %div#header
8
+ %img{:src=>'/images/logo_small.png'}
9
+ %div#body
10
+ %h1 Help
11
+ %p
12
+ Scrappy web server is a web interface to Scrappy functionalities.
13
+ You can use it to get data from a web resource and integrate it with other system.
14
+ The service uses the following URL format:
15
+ %pre http://[host]/[format]/[url]
16
+ %p
17
+ For example, to retrieve http://example.com/~user/?test=1 with RDF serialization:
18
+ %pre http://localhost:3434/rdf/http://example.com/~user/%3Ftest%3D1
19
+ %p Available serialization formats are rdf, rdfxml, png, yarf, ntriples, turtle, json, jsonrdf, and ejson.
20
+ %div#footer
21
+ %a{:href=>"/"} Home
22
+ |
23
+ %a{:href=>"/help"} Help
24
+ |
25
+ %a{:href=>'http://github.com/josei/scrappy'} About
@@ -0,0 +1,25 @@
1
+ !!!
2
+ %html
3
+ %head
4
+ %title Scrappy
5
+ %link{:type=>"text/css", :href=>"/stylesheets/application.css", :rel=>"stylesheet"}
6
+ %body
7
+ %div.center
8
+ %a{:href=>'http://github.com/josei/scrappy'}
9
+ %img{:src=>'/images/logo.png'}
10
+ %form.search
11
+ %div
12
+ %select{:name=>:format}
13
+ %option{:value=>:rdf} RDF
14
+ %option{:value=>:png} PNG
15
+ %option{:value=>:ejson} JSON
16
+ %option{:value=>:yarf} YARF
17
+ %option{:value=>:ntriples} nTriples
18
+ %input{:name=>:uri, :size=>30, :type=>:text, :value=>'Enter URI...', :onclick=>"if(this.value=='Enter URI...') this.value='';"}
19
+ %button Scrape
20
+ %div#footer
21
+ %a{:href=>"/"} Home
22
+ |
23
+ %a{:href=>"/help"} Help
24
+ |
25
+ %a{:href=>'http://github.com/josei/scrappy'} About
data/lib/scrappy.rb CHANGED
@@ -21,7 +21,7 @@ require 'scrappy/agent/agent'
21
21
  Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
22
22
 
23
23
  module Scrappy
24
- VERSION = '0.1.22'
24
+ VERSION = '0.1.23'
25
25
  end
26
26
 
27
27
  # Require selectors
data/scrappy.gemspec CHANGED
@@ -2,17 +2,17 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{scrappy}
5
- s.version = "0.1.22"
5
+ s.version = "0.1.23"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Jose Ignacio"]
9
- s.date = %q{2011-02-25}
9
+ s.date = %q{2011-03-03}
10
10
  s.default_executable = %q{scrappy}
11
11
  s.description = %q{RDF web scraper}
12
12
  s.email = %q{joseignacio.fernandez@gmail.com}
13
13
  s.executables = ["scrappy"]
14
- s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/views/home.haml", "lib/scrappy/webkit/webkit.rb"]
15
- s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/views/home.haml", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
14
+ s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/proxy.rb", "lib/scrappy/server/server.rb", "lib/scrappy/server/public/images/logo.png", "lib/scrappy/server/public/images/logo_small.png", "lib/scrappy/server/public/stylesheets/application.css", "lib/scrappy/server/views/home.haml", "lib/scrappy/server/views/help.haml", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
15
+ s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/proxy.rb", "lib/scrappy/server/server.rb", "lib/scrappy/server/public/images/logo.png", "lib/scrappy/server/public/images/logo_small.png", "lib/scrappy/server/public/stylesheets/application.css", "lib/scrappy/server/views/home.haml", "lib/scrappy/server/views/help.haml", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
16
16
  s.homepage = %q{http://github.com/josei/scrappy}
17
17
  s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
18
18
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 22
9
- version: 0.1.22
8
+ - 23
9
+ version: 0.1.23
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-02-25 00:00:00 +01:00
17
+ date: 2011-03-03 00:00:00 +01:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -148,7 +148,6 @@ extra_rdoc_files:
148
148
  - lib/scrappy/agent/map_reduce.rb
149
149
  - lib/scrappy/agent/extractor.rb
150
150
  - lib/scrappy/agent/visual_agent.rb
151
- - lib/scrappy/proxy.rb
152
151
  - lib/scrappy/selectors/base_uri.rb
153
152
  - lib/scrappy/selectors/css.rb
154
153
  - lib/scrappy/selectors/new_uri.rb
@@ -158,10 +157,15 @@ extra_rdoc_files:
158
157
  - lib/scrappy/selectors/uri.rb
159
158
  - lib/scrappy/selectors/uri_pattern.rb
160
159
  - lib/scrappy/selectors/xpath.rb
161
- - lib/scrappy/server.rb
160
+ - lib/scrappy/server/proxy.rb
161
+ - lib/scrappy/server/server.rb
162
+ - lib/scrappy/server/public/images/logo.png
163
+ - lib/scrappy/server/public/images/logo_small.png
164
+ - lib/scrappy/server/public/stylesheets/application.css
165
+ - lib/scrappy/server/views/home.haml
166
+ - lib/scrappy/server/views/help.haml
162
167
  - lib/scrappy/shell.rb
163
168
  - lib/scrappy/support.rb
164
- - lib/scrappy/views/home.haml
165
169
  - lib/scrappy/webkit/webkit.rb
166
170
  files:
167
171
  - History.txt
@@ -180,7 +184,6 @@ files:
180
184
  - lib/scrappy/agent/map_reduce.rb
181
185
  - lib/scrappy/agent/extractor.rb
182
186
  - lib/scrappy/agent/visual_agent.rb
183
- - lib/scrappy/proxy.rb
184
187
  - lib/scrappy/selectors/base_uri.rb
185
188
  - lib/scrappy/selectors/css.rb
186
189
  - lib/scrappy/selectors/new_uri.rb
@@ -190,10 +193,15 @@ files:
190
193
  - lib/scrappy/selectors/uri.rb
191
194
  - lib/scrappy/selectors/uri_pattern.rb
192
195
  - lib/scrappy/selectors/xpath.rb
193
- - lib/scrappy/server.rb
196
+ - lib/scrappy/server/proxy.rb
197
+ - lib/scrappy/server/server.rb
198
+ - lib/scrappy/server/public/images/logo.png
199
+ - lib/scrappy/server/public/images/logo_small.png
200
+ - lib/scrappy/server/public/stylesheets/application.css
201
+ - lib/scrappy/server/views/home.haml
202
+ - lib/scrappy/server/views/help.haml
194
203
  - lib/scrappy/shell.rb
195
204
  - lib/scrappy/support.rb
196
- - lib/scrappy/views/home.haml
197
205
  - lib/scrappy/webkit/webkit.rb
198
206
  - test/test_helper.rb
199
207
  - test/test_scrappy.rb
@@ -1,23 +0,0 @@
1
- !!!
2
- %html
3
- %head
4
- %title Scrappy Web Server
5
- %body
6
- %h1 Scrappy Web Server
7
- %p Use following URL format: http://[host]/[format]/[url]
8
- %p
9
- For example:
10
- %a{:href=>"http://localhost:3434/rdfxml/http://www.google.com"}
11
- http://localhost:3434/rdfxml/http://www.google.com
12
-
13
- Remember to escape parameters: http://www.example.com/~user/%3Ftest%3D1%26test1%3D2
14
- %br
15
- or
16
- %br
17
- http%3A%2F%2Fwww.example.com%2F~user%2F%3Ftest%3D1%26test1%3D2
18
- %br
19
- instead of
20
- %br
21
- http://www.example.com/~user/?test=1&test1=2"
22
-
23
- %p Available formats are png, yarf, rdfxml, ntriples, turtle, json, jsonrdf, ejson