scrappy 0.2.1 → 0.3.0

data/History.txt CHANGED
@@ -1,3 +1,8 @@
+ === 0.3.0 2011-03-11
+
+ * Removed unused features: shell, browser, scripting
+ * Correction in RootSelector
+
  === 0.2.1 2011-03-11
 
  * Added a web admin interface mode
data/Manifest CHANGED
@@ -12,7 +12,6 @@ lib/scrappy/agent/dumper.rb
  lib/scrappy/agent/extractor.rb
  lib/scrappy/agent/formats.rb
  lib/scrappy/agent/map_reduce.rb
- lib/scrappy/agent/visual_agent.rb
  lib/scrappy/repository.rb
  lib/scrappy/selectors/base_uri.rb
  lib/scrappy/selectors/css.rb
@@ -24,11 +23,10 @@ lib/scrappy/selectors/uri.rb
  lib/scrappy/selectors/uri_pattern.rb
  lib/scrappy/selectors/xpath.rb
  lib/scrappy/server/admin.rb
+ lib/scrappy/server/errors.rb
  lib/scrappy/server/helpers.rb
  lib/scrappy/server/server.rb
- lib/scrappy/shell.rb
  lib/scrappy/support.rb
- lib/scrappy/webkit/webkit.rb
  public/favicon.ico
  public/images/logo.png
  public/images/logo_tiny.png
data/README.rdoc CHANGED
@@ -58,62 +58,23 @@ scrappy offers many different interfaces to get RDF data from a web page:
 
  * Command-line interface:
 
- $ scrappy -g elmundo.es
-
- * Interactive shell:
-
- $ scrappy -i
- Launching scrappy Shell...
- $ get elmundo.es
- dc: http://purl.org/dc/elements/1.1/
- owl: http://www.w3.org/2002/07/owl#
- rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
- sc: http://lab.gsi.dit.upm.es/scraping.rdf#
- rdfs: http://www.w3.org/2000/01/rdf-schema#
- http://www.elmundo.es/elmundo/2010/10/05/gentes/1286310993.html:
- dc:description: "Las vacaciones del n\u00famero uno"
- dc:title:
- "Una suite de 5.000 euros para Nadal en Tailandia"
- "Una suite de 5.000 euros para Nadal"
- rdf:type: http://rdfs.org/sioc/ns#Post
- dc:creator: "Fernando Domingo | John Bali (V\u00eddeo)"
- http://www.daml.org/experiment/ontology/location-ont#location:
- *:
- rdf:label: "Bangkok"
- rdf:type: http://www.daml.org/experiment/ontology/location-ont#Location
- dc:date: "mi\u00e9rcoles 06/10/2010"
- ...
-
- http://www.elmundo.es$
+ $ scrappy -g example.com
 
- * Web Service interface:
+ * Web Admin interface:
 
- $ scrappy -s
- Launching scrappy Web Server...
- ** Starting Mongrel on localhost:3434
+ $ scrappy -a
+ Launching Scrappy Web Admin (browse http://localhost:3434)...
+ == Sinatra/1.1.3 has taken the stage on 3434 for production with backup from Thin
 
  Then point your browser to http://localhost:3434 for additional directions.
 
- * Web Proxy interface:
-
- $ scrappy -S
- Launching scrappy Web Proxy...
- ** Starting Mongrel on localhost:3434
-
- Then configure your browser's HTTP proxy to http://localhost:3434 and browse http://www.elmundo.es
-
- * Scripting (experimental):
-
- You can create scripts that retrieve many web pages and run them using scrappy.
-
- #!/usr/bin/scrappy
- get elmundo.es
- get google.com/search?q=testing
+ * Web Service interface:
 
- Then you can run your script from the command line just as any other bash script.
+ $ scrappy -s
+ Launching Scrappy Web Server...
+ == Sinatra/1.1.3 has taken the stage on 3434 for production with backup from Thin
 
- We plan to enable complex operations such as posting forms and definining a useful language
- with variables to enable flow control in order to build web service mashups.
+ Then use the service in the same way as the Web Admin but for read-only operations.
 
  * Ruby interface:
 
@@ -129,7 +90,7 @@ scrappy offers many different interfaces to get RDF data from a web page:
  Scrappy::Agent::Options.kb = kb
 
  # Create an agent
- agent = Scrappy::Agent.create
+ agent = Scrappy::Agent.new
 
  # Get RDF output
  output = agent.request :method=>:get, :uri=>'http://www.elmundo.es'
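Taken together, the new README fragments reduce the 0.3.0 Ruby interface to the sketch below (assuming kb already holds a knowledge base loaded elsewhere; elmundo.es is just the README's running example):

    require 'scrappy'

    # Shared options for every agent; kb is assumed to be an RDF graph
    # loaded beforehand (e.g. from kb/elmundo.yarf)
    Scrappy::Agent::Options.kb = kb

    # 0.3.0 drops Agent.create in favour of plain instantiation
    agent = Scrappy::Agent.new

    # Fetch a page and get its RDF output
    output = agent.request :method => :get, :uri => 'http://www.elmundo.es'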
data/bin/scrappy CHANGED
@@ -38,17 +38,14 @@ module Scrappy
  opts.on('-p URI', '--post URI') { |uri| Options.uri = uri; Options.http_method=:post }
  opts.on('-D', '--dump') { Agent::Options.dump = true; Agent::Options.format = :rdf }
  opts.on('-u', '--debug') { Agent::Options.debug = true }
- opts.on('-i', '--interactive') { Options.shell = true; Agent::Options.format_header = false }
  opts.on('-s', '--server [BASE_URI]') { |uri| Options.server = true; Options.base_uri = uri }
  opts.on('-a', '--admin [BASE_URI]') { |uri| Options.admin = true; Options.base_uri = uri }
  opts.on('-P P', '--port P') { |p| Options.port = p }
  opts.on('-c C', '--concurrence C') { |c| Agent::Options.workers = c.to_i }
  opts.on('-d D', '--delay D') { |d| Agent::Options.delay = d; Agent::Options.workers = 1 }
  opts.on('-l L', '--levels L') { |l| Agent::Options.depth = l.to_i }
- opts.on('-V', '--visual') { Agent::Options.agent = :visual }
  opts.on('-r', '--reference') { Agent::Options.referenceable = :minimum }
  opts.on('-R', '--reference-all') { Agent::Options.referenceable = :dump }
- opts.on('-w', '--window') { Agent::Options.window = true }
  opts.on('-f FORMAT', '--format FORMAT') { |f| Agent::Options.format = f.to_sym }
  opts.on('-t TIME', '--time TIME') { |t| Agent::Options.time = t.to_i*60 } # converts minutes to seconds
  opts.on('-o URIs', '--observe URIs') { |uris| Options.observe = uris.split(',') }
@@ -60,11 +57,11 @@ module Scrappy
  onload
  if Options.uri
  Options.quiet = true
- puts Agent.create.proxy(:http_method=>:get, :uri=>Options.uri).output
+ puts Agent.new.proxy(:http_method=>:get, :uri=>Options.uri).output
  elsif Options.observe
- Agent.create.observe(Options.observe)
+ Agent.new.observe(Options.observe)
  elsif Options.admin
- puts "Launching Scrappy Admin Web Server (browse http://localhost:#{Options.port})..."
+ puts "Launching Scrappy Web Admin (browse http://localhost:#{Options.port})..."
  require 'scrappy/server/server'
  Thin::Logging.silent = true
  Scrappy::Server.register Scrappy::Admin
@@ -76,14 +73,10 @@ module Scrappy
  Thin::Logging.silent = true
  Scrappy::Server.run! :host => 'localhost', :port => Options.port, :environment => :production,
  :base_uri => Options.base_uri
- elsif Options.shell
- puts "Launching Scrappy Shell..."
- require 'scrappy/shell'
- Shell.new.run
  else
- require 'scrappy/shell'
- Options.quiet = true
- Shell.new(@file).run
+ output_version
+ puts 'To get help use: scrappy -h'
+ exit 0
  end
  Scrappy::App.quit
  end
@@ -106,7 +99,7 @@ Usage
  Options
  -h, --help Displays help message
  -v, --version Display the version, then exit
- -f, --format Picks output format (json, ejson, rdfxml, ntriples, png)
+ -f, --format Picks output format (json, ejson, rdf, ntriples, png)
  -g, --get URL Gets requested URL
  -p, --post URL Posts requested URL
  -c, --concurrence VALUE Sets number of concurrent connections for crawling (default is 10)
@@ -114,16 +107,13 @@ Options
  -d, --delay VALUE Sets delay (in ms) between requests (default is 0)
  -D, --dump Dumps RDF data to disk
  -u, --debug Shows debugging traces
- -i, --interactive Runs interactive shell
  -o, --observe URLs Observes the specified URLs storing their data into the repository
  -s, --server [ROOT] Runs web server (optionally specify server's root url)
- -S, --proxy-server Runs web proxy
+ -a, --admin [ROOT] Runs admin web server (optionally specify server's root url)
  -P, --port PORT Selects port number (default is 3434)
- -V, --visual Uses visual agent (slow)
- -t, --time DAYS Returns repository data from the last given minutes
+ -t, --time TIME Returns repository data from the last given minutes
  -r, --reference Outputs referenceable data
  -R, --reference-all Outputs all HTML referenceable data
- -w, --window Shows browser window (requires -v)
 
  Authors
  José Ignacio Fernández, Alberto Mardomingo, Jacobo Blasco
data/lib/scrappy.rb CHANGED
@@ -17,12 +17,13 @@ require 'scrappy/agent/map_reduce'
  require 'scrappy/agent/cache'
  require 'scrappy/agent/dumper'
  require 'scrappy/agent/formats'
+ require 'scrappy/agent/blind_agent'
  require 'scrappy/agent/agent'
 
  Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
 
  module Scrappy
-   VERSION = '0.2.1'
+   VERSION = '0.3.0'
  end
 
  # Require selectors
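The new require order matters: agent.rb now runs include BlindAgent while the class body is being evaluated, so the module must already be defined. A minimal sketch of the constraint (toy names, not scrappy's API):

    module BlindAgent; end   # must exist before the include below is evaluated

    class Agent
      include BlindAgent     # the constant is resolved immediately, not lazily
    end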
data/lib/scrappy/agent/agent.rb CHANGED
@@ -4,8 +4,9 @@ module Scrappy
  include Extractor
  include MapReduce
  include Cached
+ include BlindAgent
 
- Options = OpenStruct.new :format=>:yarf, :format_header=>true, :depth=>0, :agent=>:blind, :delay=>0, :workers=>10
+ Options = OpenStruct.new :format=>:yarf, :format_header=>true, :depth=>0, :delay=>0, :workers=>10
  ContentTypes = { :png => 'image/png', :rdfxml => 'application/rdf+xml',
  :rdf => 'application/rdf+xml' }
 
@@ -13,17 +14,7 @@ module Scrappy
  @pool ||= {}
  end
  def self.[] id
- pool[id] || Agent.create(:id=>id)
- end
-
- def self.create args={}
- if (args[:agent] || Options.agent) == :visual
- require 'scrappy/agent/visual_agent'
- VisualAgent.new args
- else
- require 'scrappy/agent/blind_agent'
- BlindAgent.new args
- end
+ pool[id] || Agent.new(:id=>id)
  end
 
  attr_accessor :id, :options, :kb
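This removes the last runtime dispatch between agent implementations: in 0.2.1, Agent.create picked VisualAgent or BlindAgent depending on options, whereas in 0.3.0 Agent mixes BlindAgent in directly, so a plain constructor is enough. Roughly:

    # 0.2.1 (removed): choose an implementation at runtime
    #   klass = options_say_visual ? VisualAgent : BlindAgent
    #   agent = klass.new(args)

    # 0.3.0: one concrete class; the pool hands back an existing agent by id
    agent = Scrappy::Agent[1]   # i.e. pool[1] || Scrappy::Agent.new(:id => 1)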
@@ -160,7 +151,8 @@ module Scrappy
  end
 
  def clean triples
- triples.uniq.select { |s,p,o| p!=Node('rdf:type') or ![Node('sc:Index'), Node('sc:Page')].include?(o) }
+ triples.uniq.select { |s,p,o| p!=ID('rdf:type') or ![ID('sc:Index'), ID('sc:Page')].include?(o) }.
+ select { |s,p,o| p!=Node('rdf:type') or ![Node('sc:Index'), Node('sc:Page')].include?(o) }
  end
 
  # Do the extraction using RDF repository
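clean now applies the same artifact filter twice, once per spelling of a resource (assuming ID and Node are lightrdf's two constructors for the same identifiers). De Morgan turns the chained selects into the slightly more readable rejects:

    # Drop rdf:type triples that only mark scraping artifacts
    # (sc:Index, sc:Page), whichever constructor produced them
    triples.uniq.
      reject { |s, p, o| p == ID('rdf:type')   && [ID('sc:Index'), ID('sc:Page')].include?(o) }.
      reject { |s, p, o| p == Node('rdf:type') && [Node('sc:Index'), Node('sc:Page')].include?(o) }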
@@ -231,7 +223,6 @@ module Scrappy
  puts 'done!' if options.debug
 
  if self.html_data?
- add_visual_data! if options.referenceable # Adds tags including visual information
  triples = extract(self.uri, html, options.referenceable) # Extract data
  Dumper.dump self.uri, clean(triples), options.format if options.dump # Dump results to disk
  triples
data/lib/scrappy/agent/blind_agent.rb CHANGED
@@ -1,7 +1,7 @@
  module Scrappy
- class BlindAgent < Agent
+ module BlindAgent
  def initialize args={}
- super
+ super()
  @mechanize = Mechanize.new
  @mechanize.max_history = 20
  end
@@ -36,8 +36,5 @@ module Scrappy
  def html
  @mechanize.current_page.root.to_html :encoding=>'UTF-8'
  end
-
- def add_visual_data!
- end
  end
  end
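Turning BlindAgent from a subclass into a mixin also explains the switch from super to super(): bare super would forward the args hash up the ancestor chain, and Object#initialize accepts no arguments. A self-contained sketch of the pattern (toy names):

    module Fetcher
      def initialize(args = {})
        super()                  # explicit (): pass nothing up the chain
        @delay = args[:delay] || 0
      end
    end

    class Agent
      include Fetcher            # Fetcher#initialize now runs on Agent.new
    end

    Agent.new(:delay => 1)       # fine; a bare super here would raise ArgumentError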
data/lib/scrappy/agent/extractor.rb CHANGED
@@ -96,7 +96,7 @@ module Scrappy
  end
 
  def filter selector, doc
- if !selector.sc::debug.empty? and options.debug
+ if selector.sc::debug.first=="true" and options.debug
  puts '== DEBUG'
  puts '== Selector:'
  puts selector.serialize(:yarf, false)
@@ -109,7 +109,7 @@ module Scrappy
  # Process selector
  results = selector_pool(selector).filter doc
 
- if !selector.sc::debug.empty? and options.debug
+ if selector.sc::debug.first=="true" and options.debug
  puts "== No results" if results.empty?
  results.each_with_index do |result, i|
  puts "== Result ##{i}:"
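Both debug guards tightened the same way: 0.2.1 enabled tracing whenever a selector carried any sc:debug value, while 0.3.0 requires the first value to be the literal string "true" (sc::debug is assumed here to return the property's values as an array of strings):

    # 0.2.1: any sc:debug value at all switched tracing on
    debug = !selector.sc::debug.empty? && options.debug

    # 0.3.0: only an explicit sc:debug of "true" does
    debug = selector.sc::debug.first == "true" && options.debug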
data/lib/scrappy/agent/formats.rb CHANGED
@@ -25,9 +25,17 @@ module Scrappy
  doc.search("p").each {|n| n.replace(Nokogiri::XML::Text.new("#{n.text.strip}\n", n.document)) }
  doc.text.strip
  when Node('sc:Html') then
- node.to_html
+ if node.respond_to? :to_html
+ node.to_html
+ else
+ node.to_s
+ end
  else
- node.text
+ if node.respond_to? :text
+ node.text
+ else
+ node.to_s
+ end
  end
  end
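The formatted value can now be a plain string instead of a Nokogiri node (likely a consequence of the RootSelector fix below), so the formatter feature-tests before calling node methods. The same guard in isolation, as a sketch:

    require 'nokogiri'

    # Render as HTML when the value supports it, else fall back to to_s
    def as_html(node)
      node.respond_to?(:to_html) ? node.to_html : node.to_s
    end

    as_html(Nokogiri::HTML('<p>hi</p>').at('p'))  # => "<p>hi</p>"
    as_html('already plain text')                 # => "already plain text"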
data/lib/scrappy/selectors/root.rb CHANGED
@@ -8,7 +8,7 @@ module Sc
  # Select node's attribute if given
  sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content][attribute] } }
  else
- [ { :uri=>doc[:uri], :content=>doc[:content], :value=>format(doc[:content], sc::format, doc[:uri]) } ]
+ [ { :uri=>doc[:uri], :content=>doc[:content], :value=>format(doc[:value], sc::format, doc[:uri]) } ]
  end
  end
  end
data/lib/scrappy/server/errors.rb ADDED
@@ -0,0 +1,13 @@
+ module Scrappy
+   module Errors
+     def self.registered app
+       app.error do
+         "Internal error"
+       end
+
+       app.not_found do
+         "Resource not found"
+       end
+     end
+   end
+ end
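The new file follows Sinatra's extension convention: a module that defines a registered hook receives the application class and declares handlers on it. A usage sketch (DemoApp is hypothetical; Sinatra's register invokes the hook):

    require 'sinatra/base'
    require 'scrappy/server/errors'

    class DemoApp < Sinatra::Base
      register Scrappy::Errors   # calls Scrappy::Errors.registered(DemoApp)
    end

    # Unhandled exceptions now render "Internal error";
    # unmatched routes render "Resource not found".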
data/lib/scrappy/server/server.rb CHANGED
@@ -3,10 +3,12 @@ require 'thin'
  require 'haml'
  require 'scrappy/server/helpers'
  require 'scrappy/server/admin'
+ require 'scrappy/server/errors'
 
  module Scrappy
  class Server < Sinatra::Base
  helpers JavaScriptHelpers
+ register Errors
 
  enable :sessions
  set :root, File.join(File.dirname(__FILE__), '..', '..', '..')
@@ -40,7 +42,7 @@ module Scrappy
  return @agent if @agent
  if session[:agent].nil? || session[:token] != SESSION_TOKEN
  session[:token] = SESSION_TOKEN
- session[:agent] = Scrappy::Agent.create.id
+ session[:agent] = Scrappy::Agent.new.id
  end
  @agent = Scrappy::Agent[session[:agent]]
  end
data/public/stylesheets/application.css CHANGED
@@ -150,7 +150,7 @@ ul.detail li span {
  display: inline-block;
  }
  ul.detail li span.name {
- width: 600px;
+ width: 550px;
  overflow-x: hidden;
  font-family: monospace;
  font-size: 12px;
data/scrappy.gemspec CHANGED
@@ -2,7 +2,7 @@
 
  Gem::Specification.new do |s|
  s.name = %q{scrappy}
- s.version = "0.2.1"
+ s.version = "0.3.0"
 
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Jose Ignacio"]
@@ -11,22 +11,22 @@ Gem::Specification.new do |s|
  s.description = %q{RDF web scraper}
  s.email = %q{joseignacio.fernandez@gmail.com}
  s.executables = ["scrappy"]
- s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/repository.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
- s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/repository.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "public/favicon.ico", "public/images/logo.png", "public/images/logo_tiny.png", "public/javascripts/scrappy.js", "public/stylesheets/application.css", "test/test_helper.rb", "test/test_scrappy.rb", "views/help.haml", "views/home.haml", "views/kb.haml", "views/layout.haml", "scrappy.gemspec"]
+ s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/repository.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb"]
+ s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/repository.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb", "public/favicon.ico", "public/images/logo.png", "public/images/logo_tiny.png", "public/javascripts/scrappy.js", "public/stylesheets/application.css", "test/test_helper.rb", "test/test_scrappy.rb", "views/help.haml", "views/home.haml", "views/kb.haml", "views/layout.haml", "scrappy.gemspec"]
  s.homepage = %q{http://github.com/josei/scrappy}
  s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
  s.require_paths = ["lib"]
  s.rubyforge_project = %q{scrappy}
- s.rubygems_version = %q{1.3.6}
+ s.rubygems_version = %q{1.3.7}
  s.summary = %q{Web scraper that allows producing RDF data out of plain web pages}
- s.test_files = ["test/test_scrappy.rb", "test/test_helper.rb"]
+ s.test_files = ["test/test_helper.rb", "test/test_scrappy.rb"]
 
  if s.respond_to? :specification_version then
  current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
  s.specification_version = 3
 
- if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
  s.add_runtime_dependency(%q<activesupport>, [">= 2.3.5"])
  s.add_runtime_dependency(%q<sinatra>, [">= 1.1.2"])
  s.add_runtime_dependency(%q<thin>, [">= 1.2.7"])
data/views/kb.haml CHANGED
@@ -10,6 +10,6 @@
  -else
  =uri
  -if !uri.include?('*')
- -[['RDF', :rdf], ['JSON', :ejson], ['YARF', :yarf], ['PNG', :png]].reverse.each do |format, format_code|
+ -[['RDF', :rdf], ['JSON', :ejson], ['YARF', :yarf], ['nTriples', :ntriples], ['PNG', :png]].reverse.each do |format, format_code|
  %span.format
  %a{:href=>"#{settings.base_uri}/#{format_code}/#{uri}"}=format
metadata CHANGED
@@ -1,12 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: scrappy
  version: !ruby/object:Gem::Version
+ hash: 19
  prerelease: false
  segments:
  - 0
- - 2
- - 1
- version: 0.2.1
+ - 3
+ - 0
+ version: 0.3.0
  platform: ruby
  authors:
  - Jose Ignacio
@@ -21,9 +22,11 @@ dependencies:
  name: activesupport
  prerelease: false
  requirement: &id001 !ruby/object:Gem::Requirement
+ none: false
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
+ hash: 9
  segments:
  - 2
  - 3
@@ -35,9 +38,11 @@ dependencies:
  name: sinatra
  prerelease: false
  requirement: &id002 !ruby/object:Gem::Requirement
+ none: false
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
+ hash: 23
  segments:
  - 1
  - 1
@@ -49,9 +54,11 @@ dependencies:
  name: thin
  prerelease: false
  requirement: &id003 !ruby/object:Gem::Requirement
+ none: false
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
+ hash: 17
  segments:
  - 1
  - 2
@@ -63,9 +70,11 @@ dependencies:
  name: nokogiri
  prerelease: false
  requirement: &id004 !ruby/object:Gem::Requirement
+ none: false
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
+ hash: 5
  segments:
  - 1
  - 4
@@ -77,9 +86,11 @@ dependencies:
  name: mechanize
  prerelease: false
  requirement: &id005 !ruby/object:Gem::Requirement
+ none: false
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
+ hash: 23
  segments:
  - 1
  - 0
@@ -91,9 +102,11 @@ dependencies:
  name: lightrdf
  prerelease: false
  requirement: &id006 !ruby/object:Gem::Requirement
+ none: false
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
+ hash: 21
  segments:
  - 0
  - 2
@@ -105,9 +118,11 @@ dependencies:
  name: i18n
  prerelease: false
  requirement: &id007 !ruby/object:Gem::Requirement
+ none: false
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
+ hash: 11
  segments:
  - 0
  - 4
@@ -119,9 +134,11 @@ dependencies:
  name: rest-client
  prerelease: false
  requirement: &id008 !ruby/object:Gem::Requirement
+ none: false
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
+ hash: 13
  segments:
  - 1
  - 6
@@ -133,9 +150,11 @@ dependencies:
  name: haml
  prerelease: false
  requirement: &id009 !ruby/object:Gem::Requirement
+ none: false
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
+ hash: 55
  segments:
  - 3
  - 0
@@ -160,7 +179,6 @@ extra_rdoc_files:
  - lib/scrappy/agent/extractor.rb
  - lib/scrappy/agent/formats.rb
  - lib/scrappy/agent/map_reduce.rb
- - lib/scrappy/agent/visual_agent.rb
  - lib/scrappy/repository.rb
  - lib/scrappy/selectors/base_uri.rb
  - lib/scrappy/selectors/css.rb
@@ -172,11 +190,10 @@ extra_rdoc_files:
  - lib/scrappy/selectors/uri_pattern.rb
  - lib/scrappy/selectors/xpath.rb
  - lib/scrappy/server/admin.rb
+ - lib/scrappy/server/errors.rb
  - lib/scrappy/server/helpers.rb
  - lib/scrappy/server/server.rb
- - lib/scrappy/shell.rb
  - lib/scrappy/support.rb
- - lib/scrappy/webkit/webkit.rb
  files:
  - History.txt
  - Manifest
@@ -192,7 +209,6 @@ files:
  - lib/scrappy/agent/extractor.rb
  - lib/scrappy/agent/formats.rb
  - lib/scrappy/agent/map_reduce.rb
- - lib/scrappy/agent/visual_agent.rb
  - lib/scrappy/repository.rb
  - lib/scrappy/selectors/base_uri.rb
  - lib/scrappy/selectors/css.rb
@@ -204,11 +220,10 @@ files:
  - lib/scrappy/selectors/uri_pattern.rb
  - lib/scrappy/selectors/xpath.rb
  - lib/scrappy/server/admin.rb
+ - lib/scrappy/server/errors.rb
  - lib/scrappy/server/helpers.rb
  - lib/scrappy/server/server.rb
- - lib/scrappy/shell.rb
  - lib/scrappy/support.rb
- - lib/scrappy/webkit/webkit.rb
  - public/favicon.ico
  - public/images/logo.png
  - public/images/logo_tiny.png
@@ -236,16 +251,20 @@ rdoc_options:
  require_paths:
  - lib
  required_ruby_version: !ruby/object:Gem::Requirement
+ none: false
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
+ hash: 3
  segments:
  - 0
  version: "0"
  required_rubygems_version: !ruby/object:Gem::Requirement
+ none: false
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
+ hash: 11
  segments:
  - 1
  - 2
@@ -253,10 +272,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  requirements: []
 
  rubyforge_project: scrappy
- rubygems_version: 1.3.6
+ rubygems_version: 1.3.7
  signing_key:
  specification_version: 3
  summary: Web scraper that allows producing RDF data out of plain web pages
  test_files:
- - test/test_scrappy.rb
  - test/test_helper.rb
+ - test/test_scrappy.rb
data/lib/scrappy/agent/visual_agent.rb DELETED
@@ -1,101 +0,0 @@
- # Hack to hide annoying gtk debug messages
- old_stderr = $stderr.clone
- $stderr.reopen '/dev/null'
- require 'scrappy/webkit/webkit'
- $stderr = old_stderr
-
- module Scrappy
- class VisualAgent < Agent
- attr_reader :visible
-
- def initialize args={}
- super
-
- @cv = new_cond
-
- @webview = Gtk::WebKit::WebView.new
- @webview.signal_connect("load_finished") { synchronize { @cv.signal } }
-
- @window = Gtk::Window.new
- @window.signal_connect("destroy") { Gtk.main_quit }
- @window.add(@webview)
- @window.set_size_request(1024, 600)
- if args[:window] or (args[:window].nil? and Agent::Options.window)
- @window.show_all
- @visible = true
- end
- @mechanize = Mechanize.new
- end
-
- def uri
- @uri
- end
-
- def uri= uri
- # First, check if the requested uri is a valid HTML page
- valid = begin
- @mechanize.get(uri).is_a?(Mechanize::Page)
- rescue
- false
- end
-
- # Open the page in the browser if it's an HTML page
- if valid
- synchronize do
- @webview.open uri.to_s
- @cv.wait(60) # 1 minute to open the page
- @uri = @webview.uri
- end
- else
- @uri = nil
- end
- end
-
- def html_data?
- uri.to_s != ""
- end
-
- def html
- js "document.documentElement.outerHTML"
- end
-
- def add_visual_data!
- js """var items = document.documentElement.getElementsByTagName('*');
- var i=0;
- for(var i=0; i<items.length; i++) {
- var item = items[i];
- item.setAttribute('vx', item.offsetLeft);
- item.setAttribute('vy', item.offsetTop);
- item.setAttribute('vw', item.offsetWidth);
- item.setAttribute('vh', item.offsetHeight);
- item.setAttribute('vsize', document.defaultView.getComputedStyle(item, null).getPropertyValue('font-size'));
- var weight = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-weight');
- if (weight == 'normal') weight = 400;
- if (weight == 'bold') weight = 700;
- item.setAttribute('vweight', weight);
- item.setAttribute('vcolor', document.defaultView.getComputedStyle(item, null).getPropertyValue('color'));
- item.setAttribute('vbcolor', document.defaultView.getComputedStyle(item, null).getPropertyValue('background-color'));
- }"""
- end
-
- def js code
- old_title = @webview.title
- @webview.execute_script("document.title = JSON.stringify(eval(#{ActiveSupport::JSON.encode(code)}))")
- title = ActiveSupport::JSON.decode(@webview.title)
- @webview.execute_script("document.title = #{ActiveSupport::JSON.encode(old_title)}")
- title
- end
-
- def load_js url
- function = """function include(destination) {
- var e=window.document.createElement('script');
- e.setAttribute('src',destination);
- window.document.body.appendChild(e);
- }"""
- js function
- js "include('#{url}')"
- end
- end
- end
-
- Thread.new { Gtk.main }
data/lib/scrappy/shell.rb DELETED
@@ -1,87 +0,0 @@
- require 'readline'
-
- module Scrappy
- class Shell
- def initialize file=nil
- @agent = Agent.create
- @file = file
- end
-
- def run
- commands = ['get', 'quit', 'help', 'annotate', 'html']
-
- Readline.completion_append_character = " "
- Readline.completer_word_break_characters = ""
- Readline.completion_proc = proc { |line| commands.grep(/^#{Regexp.escape(line)}/).sort }
-
- if @file
- open(@file, 'r').lines.each do |line|
- break if process(line) == :quit
- end
- else
- begin
- line = Readline.readline(bash, true)
- code = process line.nil? ? (puts 'quit' unless Options.quiet; 'quit') : line
- end while code != :quit
- end
- end
-
- protected
- def process raw_command
- command = raw_command.strip
-
- code = if command =~ /\Aget\W(.*)\Z/
- puts @agent.proxy(:uri=>$1).output
- puts
- elsif command == 'help'
- puts 'Available commands:'
- puts ' get URL: Visit the specified URL'
- puts ' html: Show HTML code of the current URL'
- puts ' annotate: Start the annotation tool that helps building extractors'
- puts ' help: Show this information'
- puts ' quit: Exit scrappy shell'
- puts
- elsif command == 'annotate'
- if @agent.class.to_s == 'Scrappy::VisualAgent' and @agent.visible
- @agent.load_js "http://ajax.googleapis.com/ajax/libs/jquery/1/jquery.min.js"
- @agent.load_js "http://github.com/josei/scrappy/raw/master/lib/js/annotator.js"
- puts "Use the browser's window to annotate resources"
- puts
- else
- puts 'ERROR: Scrappy must be run with -v and -w options to use this feature'
- puts
- end
- elsif command == 'html'
- puts @agent.html
- puts
- elsif command == 'quit'
- :quit
- elsif command == '' or command[0..0] == '#'
- nil
- else
- puts "ERROR: Unknown command '#{command}'"
- puts
- end
- code
- end
-
- def bash
- return '' if Options.quiet
- location = if @agent.uri
- uri = URI::parse(@agent.uri)
- path = uri.path.to_s
- path = path[0..0] + "..." + path[-16..-1] if path.size > 20
- if uri.query
- query = "?" + uri.query
- query = "?..." + query[-10..-1] if query.size > 13
- else
- query = ""
- end
- "#{uri.base}#{path}#{query}"
- else
- ''
- end
- "#{location}$ "
- end
- end
- end
data/lib/scrappy/webkit/webkit.rb DELETED
@@ -1,18 +0,0 @@
- require 'gtk2'
- module Gtk
- module WebKit
- end
- end
-
- require 'rbwebkitgtk.so'
-
- class Gtk::WebKit::WebView
- alias :load_html_string_no_defaults :load_html_string
- def load_html_string(content, base_uri=nil)
- load_html_string_no_defaults(content, base_uri)
- end
-
- def mark_text_matches(test, case_sensitive=false, limit=0)
- mark_text_matches_with_limit(test, case_sensitive, limit)
- end
- end