scrappy 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,8 @@
1
+ === 0.3.0 2011-03-11
2
+
3
+ * Removed unused features: shell, browser, scripting
4
+ * Correction in RootSelector
5
+
1
6
  === 0.2.1 2011-03-11
2
7
 
3
8
  * Added a web admin interface mode
data/Manifest CHANGED
@@ -12,7 +12,6 @@ lib/scrappy/agent/dumper.rb
12
12
  lib/scrappy/agent/extractor.rb
13
13
  lib/scrappy/agent/formats.rb
14
14
  lib/scrappy/agent/map_reduce.rb
15
- lib/scrappy/agent/visual_agent.rb
16
15
  lib/scrappy/repository.rb
17
16
  lib/scrappy/selectors/base_uri.rb
18
17
  lib/scrappy/selectors/css.rb
@@ -24,11 +23,10 @@ lib/scrappy/selectors/uri.rb
24
23
  lib/scrappy/selectors/uri_pattern.rb
25
24
  lib/scrappy/selectors/xpath.rb
26
25
  lib/scrappy/server/admin.rb
26
+ lib/scrappy/server/errors.rb
27
27
  lib/scrappy/server/helpers.rb
28
28
  lib/scrappy/server/server.rb
29
- lib/scrappy/shell.rb
30
29
  lib/scrappy/support.rb
31
- lib/scrappy/webkit/webkit.rb
32
30
  public/favicon.ico
33
31
  public/images/logo.png
34
32
  public/images/logo_tiny.png
@@ -58,62 +58,23 @@ scrappy offers many different interfaces to get RDF data from a web page:
58
58
 
59
59
  * Command-line interface:
60
60
 
61
- $ scrappy -g elmundo.es
62
-
63
- * Interactive shell:
64
-
65
- $ scrappy -i
66
- Launching scrappy Shell...
67
- $ get elmundo.es
68
- dc: http://purl.org/dc/elements/1.1/
69
- owl: http://www.w3.org/2002/07/owl#
70
- rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
71
- sc: http://lab.gsi.dit.upm.es/scraping.rdf#
72
- rdfs: http://www.w3.org/2000/01/rdf-schema#
73
- http://www.elmundo.es/elmundo/2010/10/05/gentes/1286310993.html:
74
- dc:description: "Las vacaciones del n\u00famero uno"
75
- dc:title:
76
- "Una suite de 5.000 euros para Nadal en Tailandia"
77
- "Una suite de 5.000 euros para Nadal"
78
- rdf:type: http://rdfs.org/sioc/ns#Post
79
- dc:creator: "Fernando Domingo | John Bali (V\u00eddeo)"
80
- http://www.daml.org/experiment/ontology/location-ont#location:
81
- *:
82
- rdf:label: "Bangkok"
83
- rdf:type: http://www.daml.org/experiment/ontology/location-ont#Location
84
- dc:date: "mi\u00e9rcoles 06/10/2010"
85
- ...
86
-
87
- http://www.elmundo.es$
61
+ $ scrappy -g example.com
88
62
 
89
- * Web Service interface:
63
+ * Web Admin interface:
90
64
 
91
- $ scrappy -s
92
- Launching scrappy Web Server...
93
- ** Starting Mongrel on localhost:3434
65
+ $ scrappy -a
66
+ Launching Scrappy Web Admin (browse http://localhost:3434)...
67
+ == Sinatra/1.1.3 has taken the stage on 3434 for production with backup from Thin
94
68
 
95
69
  Then point your browser to http://localhost:3434 for additional directions.
96
70
 
97
- * Web Proxy interface:
98
-
99
- $ scrappy -S
100
- Launching scrappy Web Proxy...
101
- ** Starting Mongrel on localhost:3434
102
-
103
- Then configure your browser's HTTP proxy to http://localhost:3434 and browse http://www.elmundo.es
104
-
105
- * Scripting (experimental):
106
-
107
- You can create scripts that retrieve many web pages and run them using scrappy.
108
-
109
- #!/usr/bin/scrappy
110
- get elmundo.es
111
- get google.com/search?q=testing
71
+ * Web Service interface:
112
72
 
113
- Then you can run your script from the command line just as any other bash script.
73
+ $ scrappy -s
74
+ Launching Scrappy Web Server...
75
+ == Sinatra/1.1.3 has taken the stage on 3434 for production with backup from Thin
114
76
 
115
- We plan to enable complex operations such as posting forms and definining a useful language
116
- with variables to enable flow control in order to build web service mashups.
77
+ Then use the service in the same way as the Web Admin but for read-only operations.
117
78
 
118
79
  * Ruby interface:
119
80
 
@@ -129,7 +90,7 @@ scrappy offers many different interfaces to get RDF data from a web page:
129
90
  Scrappy::Agent::Options.kb = kb
130
91
 
131
92
  # Create an agent
132
- agent = Scrappy::Agent.create
93
+ agent = Scrappy::Agent.new
133
94
 
134
95
  # Get RDF output
135
96
  output = agent.request :method=>:get, :uri=>'http://www.elmundo.es'
@@ -38,17 +38,14 @@ module Scrappy
38
38
  opts.on('-p URI', '--post URI') { |uri| Options.uri = uri; Options.http_method=:post }
39
39
  opts.on('-D', '--dump') { Agent::Options.dump = true; Agent::Options.format = :rdf }
40
40
  opts.on('-u', '--debug') { Agent::Options.debug = true }
41
- opts.on('-i', '--interactive') { Options.shell = true; Agent::Options.format_header = false }
42
41
  opts.on('-s', '--server [BASE_URI]') { |uri| Options.server = true; Options.base_uri = uri }
43
42
  opts.on('-a', '--admin [BASE_URI]') { |uri| Options.admin = true; Options.base_uri = uri }
44
43
  opts.on('-P P', '--port P') { |p| Options.port = p }
45
44
  opts.on('-c C', '--concurrence C') { |c| Agent::Options.workers = c.to_i }
46
45
  opts.on('-d D', '--delay D') { |d| Agent::Options.delay = d; Agent::Options.workers = 1 }
47
46
  opts.on('-l L', '--levels L') { |l| Agent::Options.depth = l.to_i }
48
- opts.on('-V', '--visual') { Agent::Options.agent = :visual }
49
47
  opts.on('-r', '--reference') { Agent::Options.referenceable = :minimum }
50
48
  opts.on('-R', '--reference-all') { Agent::Options.referenceable = :dump }
51
- opts.on('-w', '--window') { Agent::Options.window = true }
52
49
  opts.on('-f FORMAT', '--format FORMAT') { |f| Agent::Options.format = f.to_sym }
53
50
  opts.on('-t TIME', '--time TIME') { |t| Agent::Options.time = t.to_i*60 } # converts minutes to seconds
54
51
  opts.on('-o URIs', '--observe URIs') { |uris| Options.observe = uris.split(',') }
@@ -60,11 +57,11 @@ module Scrappy
60
57
  onload
61
58
  if Options.uri
62
59
  Options.quiet = true
63
- puts Agent.create.proxy(:http_method=>:get, :uri=>Options.uri).output
60
+ puts Agent.new.proxy(:http_method=>:get, :uri=>Options.uri).output
64
61
  elsif Options.observe
65
- Agent.create.observe(Options.observe)
62
+ Agent.new.observe(Options.observe)
66
63
  elsif Options.admin
67
- puts "Launching Scrappy Admin Web Server (browse http://localhost:#{Options.port})..."
64
+ puts "Launching Scrappy Web Admin (browse http://localhost:#{Options.port})..."
68
65
  require 'scrappy/server/server'
69
66
  Thin::Logging.silent = true
70
67
  Scrappy::Server.register Scrappy::Admin
@@ -76,14 +73,10 @@ module Scrappy
76
73
  Thin::Logging.silent = true
77
74
  Scrappy::Server.run! :host => 'localhost', :port => Options.port, :environment => :production,
78
75
  :base_uri => Options.base_uri
79
- elsif Options.shell
80
- puts "Launching Scrappy Shell..."
81
- require 'scrappy/shell'
82
- Shell.new.run
83
76
  else
84
- require 'scrappy/shell'
85
- Options.quiet = true
86
- Shell.new(@file).run
77
+ output_version
78
+ puts 'To get help use: scrappy -h'
79
+ exit 0
87
80
  end
88
81
  Scrappy::App.quit
89
82
  end
@@ -106,7 +99,7 @@ Usage
106
99
  Options
107
100
  -h, --help Displays help message
108
101
  -v, --version Display the version, then exit
109
- -f, --format Picks output format (json, ejson, rdfxml, ntriples, png)
102
+ -f, --format Picks output format (json, ejson, rdf, ntriples, png)
110
103
  -g, --get URL Gets requested URL
111
104
  -p, --post URL Posts requested URL
112
105
  -c, --concurrence VALUE Sets number of concurrent connections for crawling (default is 10)
@@ -114,16 +107,13 @@ Options
114
107
  -d, --delay VALUE Sets delay (in ms) between requests (default is 0)
115
108
  -D, --dump Dumps RDF data to disk
116
109
  -u, --debug Shows debugging traces
117
- -i, --interactive Runs interactive shell
118
110
  -o, --observe URLs Observes the specified URLs storing their data into the repository
119
111
  -s, --server [ROOT] Runs web server (optionally specify server's root url)
120
- -S, --proxy-server Runs web proxy
112
+ -a, --admin [ROOT] Runs admin web server (optionally specify server's root url)
121
113
  -P, --port PORT Selects port number (default is 3434)
122
- -V, --visual Uses visual agent (slow)
123
- -t, --time DAYS Returns repository data from the last given minutes
114
+ -t, --time TIME Returns repository data from the last given minutes
124
115
  -r, --reference Outputs referenceable data
125
116
  -R, --reference-all Outputs all HTML referenceable data
126
- -w, --window Shows browser window (requires -v)
127
117
 
128
118
  Authors
129
119
  José Ignacio Fernández, Alberto Mardomingo, Jacobo Blasco
@@ -17,12 +17,13 @@ require 'scrappy/agent/map_reduce'
17
17
  require 'scrappy/agent/cache'
18
18
  require 'scrappy/agent/dumper'
19
19
  require 'scrappy/agent/formats'
20
+ require 'scrappy/agent/blind_agent'
20
21
  require 'scrappy/agent/agent'
21
22
 
22
23
  Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
23
24
 
24
25
  module Scrappy
25
- VERSION = '0.2.1'
26
+ VERSION = '0.3.0'
26
27
  end
27
28
 
28
29
  # Require selectors
@@ -4,8 +4,9 @@ module Scrappy
4
4
  include Extractor
5
5
  include MapReduce
6
6
  include Cached
7
+ include BlindAgent
7
8
 
8
- Options = OpenStruct.new :format=>:yarf, :format_header=>true, :depth=>0, :agent=>:blind, :delay=>0, :workers=>10
9
+ Options = OpenStruct.new :format=>:yarf, :format_header=>true, :depth=>0, :delay=>0, :workers=>10
9
10
  ContentTypes = { :png => 'image/png', :rdfxml => 'application/rdf+xml',
10
11
  :rdf => 'application/rdf+xml' }
11
12
 
@@ -13,17 +14,7 @@ module Scrappy
13
14
  @pool ||= {}
14
15
  end
15
16
  def self.[] id
16
- pool[id] || Agent.create(:id=>id)
17
- end
18
-
19
- def self.create args={}
20
- if (args[:agent] || Options.agent) == :visual
21
- require 'scrappy/agent/visual_agent'
22
- VisualAgent.new args
23
- else
24
- require 'scrappy/agent/blind_agent'
25
- BlindAgent.new args
26
- end
17
+ pool[id] || Agent.new(:id=>id)
27
18
  end
28
19
 
29
20
  attr_accessor :id, :options, :kb
@@ -160,7 +151,8 @@ module Scrappy
160
151
  end
161
152
 
162
153
  def clean triples
163
- triples.uniq.select { |s,p,o| p!=Node('rdf:type') or ![Node('sc:Index'), Node('sc:Page')].include?(o) }
154
+ triples.uniq.select { |s,p,o| p!=ID('rdf:type') or ![ID('sc:Index'), ID('sc:Page')].include?(o) }.
155
+ select { |s,p,o| p!=Node('rdf:type') or ![Node('sc:Index'), Node('sc:Page')].include?(o) }
164
156
  end
165
157
 
166
158
  # Do the extraction using RDF repository
@@ -231,7 +223,6 @@ module Scrappy
231
223
  puts 'done!' if options.debug
232
224
 
233
225
  if self.html_data?
234
- add_visual_data! if options.referenceable # Adds tags including visual information
235
226
  triples = extract(self.uri, html, options.referenceable) # Extract data
236
227
  Dumper.dump self.uri, clean(triples), options.format if options.dump # Dump results to disk
237
228
  triples
@@ -1,7 +1,7 @@
1
1
  module Scrappy
2
- class BlindAgent < Agent
2
+ module BlindAgent
3
3
  def initialize args={}
4
- super
4
+ super()
5
5
  @mechanize = Mechanize.new
6
6
  @mechanize.max_history = 20
7
7
  end
@@ -36,8 +36,5 @@ module Scrappy
36
36
  def html
37
37
  @mechanize.current_page.root.to_html :encoding=>'UTF-8'
38
38
  end
39
-
40
- def add_visual_data!
41
- end
42
39
  end
43
40
  end
@@ -96,7 +96,7 @@ module Scrappy
96
96
  end
97
97
 
98
98
  def filter selector, doc
99
- if !selector.sc::debug.empty? and options.debug
99
+ if selector.sc::debug.first=="true" and options.debug
100
100
  puts '== DEBUG'
101
101
  puts '== Selector:'
102
102
  puts selector.serialize(:yarf, false)
@@ -109,7 +109,7 @@ module Scrappy
109
109
  # Process selector
110
110
  results = selector_pool(selector).filter doc
111
111
 
112
- if !selector.sc::debug.empty? and options.debug
112
+ if selector.sc::debug.first=="true" and options.debug
113
113
  puts "== No results" if results.empty?
114
114
  results.each_with_index do |result, i|
115
115
  puts "== Result ##{i}:"
@@ -25,9 +25,17 @@ module Scrappy
25
25
  doc.search("p").each {|n| n.replace(Nokogiri::XML::Text.new("#{n.text.strip}\n", n.document)) }
26
26
  doc.text.strip
27
27
  when Node('sc:Html') then
28
- node.to_html
28
+ if node.respond_to? :to_html
29
+ node.to_html
30
+ else
31
+ node.to_s
32
+ end
29
33
  else
30
- node.text
34
+ if node.respond_to? :text
35
+ node.text
36
+ else
37
+ node.to_s
38
+ end
31
39
  end
32
40
  end
33
41
 
@@ -8,7 +8,7 @@ module Sc
8
8
  # Select node's attribute if given
9
9
  sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content][attribute] } }
10
10
  else
11
- [ { :uri=>doc[:uri], :content=>doc[:content], :value=>format(doc[:content], sc::format, doc[:uri]) } ]
11
+ [ { :uri=>doc[:uri], :content=>doc[:content], :value=>format(doc[:value], sc::format, doc[:uri]) } ]
12
12
  end
13
13
  end
14
14
  end
@@ -0,0 +1,13 @@
1
+ module Scrappy
2
+ module Errors
3
+ def self.registered app
4
+ app.error do
5
+ "Internal error"
6
+ end
7
+
8
+ app.not_found do
9
+ "Resource not found"
10
+ end
11
+ end
12
+ end
13
+ end
@@ -3,10 +3,12 @@ require 'thin'
3
3
  require 'haml'
4
4
  require 'scrappy/server/helpers'
5
5
  require 'scrappy/server/admin'
6
+ require 'scrappy/server/errors'
6
7
 
7
8
  module Scrappy
8
9
  class Server < Sinatra::Base
9
10
  helpers JavaScriptHelpers
11
+ register Errors
10
12
 
11
13
  enable :sessions
12
14
  set :root, File.join(File.dirname(__FILE__), '..', '..', '..')
@@ -40,7 +42,7 @@ module Scrappy
40
42
  return @agent if @agent
41
43
  if session[:agent].nil? || session[:token] != SESSION_TOKEN
42
44
  session[:token] = SESSION_TOKEN
43
- session[:agent] = Scrappy::Agent.create.id
45
+ session[:agent] = Scrappy::Agent.new.id
44
46
  end
45
47
  @agent = Scrappy::Agent[session[:agent]]
46
48
  end
@@ -150,7 +150,7 @@ ul.detail li span {
150
150
  display: inline-block;
151
151
  }
152
152
  ul.detail li span.name {
153
- width: 600px;
153
+ width: 550px;
154
154
  overflow-x: hidden;
155
155
  font-family: monospace;
156
156
  font-size: 12px;
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{scrappy}
5
- s.version = "0.2.1"
5
+ s.version = "0.3.0"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Jose Ignacio"]
@@ -11,22 +11,22 @@ Gem::Specification.new do |s|
11
11
  s.description = %q{RDF web scraper}
12
12
  s.email = %q{joseignacio.fernandez@gmail.com}
13
13
  s.executables = ["scrappy"]
14
- s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/repository.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
15
- s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/repository.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "public/favicon.ico", "public/images/logo.png", "public/images/logo_tiny.png", "public/javascripts/scrappy.js", "public/stylesheets/application.css", "test/test_helper.rb", "test/test_scrappy.rb", "views/help.haml", "views/home.haml", "views/kb.haml", "views/layout.haml", "scrappy.gemspec"]
14
+ s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/repository.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb"]
15
+ s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/repository.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb", "public/favicon.ico", "public/images/logo.png", "public/images/logo_tiny.png", "public/javascripts/scrappy.js", "public/stylesheets/application.css", "test/test_helper.rb", "test/test_scrappy.rb", "views/help.haml", "views/home.haml", "views/kb.haml", "views/layout.haml", "scrappy.gemspec"]
16
16
  s.homepage = %q{http://github.com/josei/scrappy}
17
17
  s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
18
18
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
19
19
  s.require_paths = ["lib"]
20
20
  s.rubyforge_project = %q{scrappy}
21
- s.rubygems_version = %q{1.3.6}
21
+ s.rubygems_version = %q{1.3.7}
22
22
  s.summary = %q{Web scraper that allows producing RDF data out of plain web pages}
23
- s.test_files = ["test/test_scrappy.rb", "test/test_helper.rb"]
23
+ s.test_files = ["test/test_helper.rb", "test/test_scrappy.rb"]
24
24
 
25
25
  if s.respond_to? :specification_version then
26
26
  current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
27
27
  s.specification_version = 3
28
28
 
29
- if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
29
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
30
30
  s.add_runtime_dependency(%q<activesupport>, [">= 2.3.5"])
31
31
  s.add_runtime_dependency(%q<sinatra>, [">= 1.1.2"])
32
32
  s.add_runtime_dependency(%q<thin>, [">= 1.2.7"])
@@ -10,6 +10,6 @@
10
10
  -else
11
11
  =uri
12
12
  -if !uri.include?('*')
13
- -[['RDF', :rdf], ['JSON', :ejson], ['YARF', :yarf], ['PNG', :png]].reverse.each do |format, format_code|
13
+ -[['RDF', :rdf], ['JSON', :ejson], ['YARF', :yarf], ['nTriples', :ntriples], ['PNG', :png]].reverse.each do |format, format_code|
14
14
  %span.format
15
15
  %a{:href=>"#{settings.base_uri}/#{format_code}/#{uri}"}=format
metadata CHANGED
@@ -1,12 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrappy
3
3
  version: !ruby/object:Gem::Version
4
+ hash: 19
4
5
  prerelease: false
5
6
  segments:
6
7
  - 0
7
- - 2
8
- - 1
9
- version: 0.2.1
8
+ - 3
9
+ - 0
10
+ version: 0.3.0
10
11
  platform: ruby
11
12
  authors:
12
13
  - Jose Ignacio
@@ -21,9 +22,11 @@ dependencies:
21
22
  name: activesupport
22
23
  prerelease: false
23
24
  requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
24
26
  requirements:
25
27
  - - ">="
26
28
  - !ruby/object:Gem::Version
29
+ hash: 9
27
30
  segments:
28
31
  - 2
29
32
  - 3
@@ -35,9 +38,11 @@ dependencies:
35
38
  name: sinatra
36
39
  prerelease: false
37
40
  requirement: &id002 !ruby/object:Gem::Requirement
41
+ none: false
38
42
  requirements:
39
43
  - - ">="
40
44
  - !ruby/object:Gem::Version
45
+ hash: 23
41
46
  segments:
42
47
  - 1
43
48
  - 1
@@ -49,9 +54,11 @@ dependencies:
49
54
  name: thin
50
55
  prerelease: false
51
56
  requirement: &id003 !ruby/object:Gem::Requirement
57
+ none: false
52
58
  requirements:
53
59
  - - ">="
54
60
  - !ruby/object:Gem::Version
61
+ hash: 17
55
62
  segments:
56
63
  - 1
57
64
  - 2
@@ -63,9 +70,11 @@ dependencies:
63
70
  name: nokogiri
64
71
  prerelease: false
65
72
  requirement: &id004 !ruby/object:Gem::Requirement
73
+ none: false
66
74
  requirements:
67
75
  - - ">="
68
76
  - !ruby/object:Gem::Version
77
+ hash: 5
69
78
  segments:
70
79
  - 1
71
80
  - 4
@@ -77,9 +86,11 @@ dependencies:
77
86
  name: mechanize
78
87
  prerelease: false
79
88
  requirement: &id005 !ruby/object:Gem::Requirement
89
+ none: false
80
90
  requirements:
81
91
  - - ">="
82
92
  - !ruby/object:Gem::Version
93
+ hash: 23
83
94
  segments:
84
95
  - 1
85
96
  - 0
@@ -91,9 +102,11 @@ dependencies:
91
102
  name: lightrdf
92
103
  prerelease: false
93
104
  requirement: &id006 !ruby/object:Gem::Requirement
105
+ none: false
94
106
  requirements:
95
107
  - - ">="
96
108
  - !ruby/object:Gem::Version
109
+ hash: 21
97
110
  segments:
98
111
  - 0
99
112
  - 2
@@ -105,9 +118,11 @@ dependencies:
105
118
  name: i18n
106
119
  prerelease: false
107
120
  requirement: &id007 !ruby/object:Gem::Requirement
121
+ none: false
108
122
  requirements:
109
123
  - - ">="
110
124
  - !ruby/object:Gem::Version
125
+ hash: 11
111
126
  segments:
112
127
  - 0
113
128
  - 4
@@ -119,9 +134,11 @@ dependencies:
119
134
  name: rest-client
120
135
  prerelease: false
121
136
  requirement: &id008 !ruby/object:Gem::Requirement
137
+ none: false
122
138
  requirements:
123
139
  - - ">="
124
140
  - !ruby/object:Gem::Version
141
+ hash: 13
125
142
  segments:
126
143
  - 1
127
144
  - 6
@@ -133,9 +150,11 @@ dependencies:
133
150
  name: haml
134
151
  prerelease: false
135
152
  requirement: &id009 !ruby/object:Gem::Requirement
153
+ none: false
136
154
  requirements:
137
155
  - - ">="
138
156
  - !ruby/object:Gem::Version
157
+ hash: 55
139
158
  segments:
140
159
  - 3
141
160
  - 0
@@ -160,7 +179,6 @@ extra_rdoc_files:
160
179
  - lib/scrappy/agent/extractor.rb
161
180
  - lib/scrappy/agent/formats.rb
162
181
  - lib/scrappy/agent/map_reduce.rb
163
- - lib/scrappy/agent/visual_agent.rb
164
182
  - lib/scrappy/repository.rb
165
183
  - lib/scrappy/selectors/base_uri.rb
166
184
  - lib/scrappy/selectors/css.rb
@@ -172,11 +190,10 @@ extra_rdoc_files:
172
190
  - lib/scrappy/selectors/uri_pattern.rb
173
191
  - lib/scrappy/selectors/xpath.rb
174
192
  - lib/scrappy/server/admin.rb
193
+ - lib/scrappy/server/errors.rb
175
194
  - lib/scrappy/server/helpers.rb
176
195
  - lib/scrappy/server/server.rb
177
- - lib/scrappy/shell.rb
178
196
  - lib/scrappy/support.rb
179
- - lib/scrappy/webkit/webkit.rb
180
197
  files:
181
198
  - History.txt
182
199
  - Manifest
@@ -192,7 +209,6 @@ files:
192
209
  - lib/scrappy/agent/extractor.rb
193
210
  - lib/scrappy/agent/formats.rb
194
211
  - lib/scrappy/agent/map_reduce.rb
195
- - lib/scrappy/agent/visual_agent.rb
196
212
  - lib/scrappy/repository.rb
197
213
  - lib/scrappy/selectors/base_uri.rb
198
214
  - lib/scrappy/selectors/css.rb
@@ -204,11 +220,10 @@ files:
204
220
  - lib/scrappy/selectors/uri_pattern.rb
205
221
  - lib/scrappy/selectors/xpath.rb
206
222
  - lib/scrappy/server/admin.rb
223
+ - lib/scrappy/server/errors.rb
207
224
  - lib/scrappy/server/helpers.rb
208
225
  - lib/scrappy/server/server.rb
209
- - lib/scrappy/shell.rb
210
226
  - lib/scrappy/support.rb
211
- - lib/scrappy/webkit/webkit.rb
212
227
  - public/favicon.ico
213
228
  - public/images/logo.png
214
229
  - public/images/logo_tiny.png
@@ -236,16 +251,20 @@ rdoc_options:
236
251
  require_paths:
237
252
  - lib
238
253
  required_ruby_version: !ruby/object:Gem::Requirement
254
+ none: false
239
255
  requirements:
240
256
  - - ">="
241
257
  - !ruby/object:Gem::Version
258
+ hash: 3
242
259
  segments:
243
260
  - 0
244
261
  version: "0"
245
262
  required_rubygems_version: !ruby/object:Gem::Requirement
263
+ none: false
246
264
  requirements:
247
265
  - - ">="
248
266
  - !ruby/object:Gem::Version
267
+ hash: 11
249
268
  segments:
250
269
  - 1
251
270
  - 2
@@ -253,10 +272,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
253
272
  requirements: []
254
273
 
255
274
  rubyforge_project: scrappy
256
- rubygems_version: 1.3.6
275
+ rubygems_version: 1.3.7
257
276
  signing_key:
258
277
  specification_version: 3
259
278
  summary: Web scraper that allows producing RDF data out of plain web pages
260
279
  test_files:
261
- - test/test_scrappy.rb
262
280
  - test/test_helper.rb
281
+ - test/test_scrappy.rb
@@ -1,101 +0,0 @@
1
- # Hack to hide annoying gtk debug messages
2
- old_stderr = $stderr.clone
3
- $stderr.reopen '/dev/null'
4
- require 'scrappy/webkit/webkit'
5
- $stderr = old_stderr
6
-
7
- module Scrappy
8
- class VisualAgent < Agent
9
- attr_reader :visible
10
-
11
- def initialize args={}
12
- super
13
-
14
- @cv = new_cond
15
-
16
- @webview = Gtk::WebKit::WebView.new
17
- @webview.signal_connect("load_finished") { synchronize { @cv.signal } }
18
-
19
- @window = Gtk::Window.new
20
- @window.signal_connect("destroy") { Gtk.main_quit }
21
- @window.add(@webview)
22
- @window.set_size_request(1024, 600)
23
- if args[:window] or (args[:window].nil? and Agent::Options.window)
24
- @window.show_all
25
- @visible = true
26
- end
27
- @mechanize = Mechanize.new
28
- end
29
-
30
- def uri
31
- @uri
32
- end
33
-
34
- def uri= uri
35
- # First, check if the requested uri is a valid HTML page
36
- valid = begin
37
- @mechanize.get(uri).is_a?(Mechanize::Page)
38
- rescue
39
- false
40
- end
41
-
42
- # Open the page in the browser if it's an HTML page
43
- if valid
44
- synchronize do
45
- @webview.open uri.to_s
46
- @cv.wait(60) # 1 minute to open the page
47
- @uri = @webview.uri
48
- end
49
- else
50
- @uri = nil
51
- end
52
- end
53
-
54
- def html_data?
55
- uri.to_s != ""
56
- end
57
-
58
- def html
59
- js "document.documentElement.outerHTML"
60
- end
61
-
62
- def add_visual_data!
63
- js """var items = document.documentElement.getElementsByTagName('*');
64
- var i=0;
65
- for(var i=0; i<items.length; i++) {
66
- var item = items[i];
67
- item.setAttribute('vx', item.offsetLeft);
68
- item.setAttribute('vy', item.offsetTop);
69
- item.setAttribute('vw', item.offsetWidth);
70
- item.setAttribute('vh', item.offsetHeight);
71
- item.setAttribute('vsize', document.defaultView.getComputedStyle(item, null).getPropertyValue('font-size'));
72
- var weight = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-weight');
73
- if (weight == 'normal') weight = 400;
74
- if (weight == 'bold') weight = 700;
75
- item.setAttribute('vweight', weight);
76
- item.setAttribute('vcolor', document.defaultView.getComputedStyle(item, null).getPropertyValue('color'));
77
- item.setAttribute('vbcolor', document.defaultView.getComputedStyle(item, null).getPropertyValue('background-color'));
78
- }"""
79
- end
80
-
81
- def js code
82
- old_title = @webview.title
83
- @webview.execute_script("document.title = JSON.stringify(eval(#{ActiveSupport::JSON.encode(code)}))")
84
- title = ActiveSupport::JSON.decode(@webview.title)
85
- @webview.execute_script("document.title = #{ActiveSupport::JSON.encode(old_title)}")
86
- title
87
- end
88
-
89
- def load_js url
90
- function = """function include(destination) {
91
- var e=window.document.createElement('script');
92
- e.setAttribute('src',destination);
93
- window.document.body.appendChild(e);
94
- }"""
95
- js function
96
- js "include('#{url}')"
97
- end
98
- end
99
- end
100
-
101
- Thread.new { Gtk.main }
@@ -1,87 +0,0 @@
1
- require 'readline'
2
-
3
- module Scrappy
4
- class Shell
5
- def initialize file=nil
6
- @agent = Agent.create
7
- @file = file
8
- end
9
-
10
- def run
11
- commands = ['get', 'quit', 'help', 'annotate', 'html']
12
-
13
- Readline.completion_append_character = " "
14
- Readline.completer_word_break_characters = ""
15
- Readline.completion_proc = proc { |line| commands.grep(/^#{Regexp.escape(line)}/).sort }
16
-
17
- if @file
18
- open(@file, 'r').lines.each do |line|
19
- break if process(line) == :quit
20
- end
21
- else
22
- begin
23
- line = Readline.readline(bash, true)
24
- code = process line.nil? ? (puts 'quit' unless Options.quiet; 'quit') : line
25
- end while code != :quit
26
- end
27
- end
28
-
29
- protected
30
- def process raw_command
31
- command = raw_command.strip
32
-
33
- code = if command =~ /\Aget\W(.*)\Z/
34
- puts @agent.proxy(:uri=>$1).output
35
- puts
36
- elsif command == 'help'
37
- puts 'Available commands:'
38
- puts ' get URL: Visit the specified URL'
39
- puts ' html: Show HTML code of the current URL'
40
- puts ' annotate: Start the annotation tool that helps building extractors'
41
- puts ' help: Show this information'
42
- puts ' quit: Exit scrappy shell'
43
- puts
44
- elsif command == 'annotate'
45
- if @agent.class.to_s == 'Scrappy::VisualAgent' and @agent.visible
46
- @agent.load_js "http://ajax.googleapis.com/ajax/libs/jquery/1/jquery.min.js"
47
- @agent.load_js "http://github.com/josei/scrappy/raw/master/lib/js/annotator.js"
48
- puts "Use the browser's window to annotate resources"
49
- puts
50
- else
51
- puts 'ERROR: Scrappy must be run with -v and -w options to use this feature'
52
- puts
53
- end
54
- elsif command == 'html'
55
- puts @agent.html
56
- puts
57
- elsif command == 'quit'
58
- :quit
59
- elsif command == '' or command[0..0] == '#'
60
- nil
61
- else
62
- puts "ERROR: Unknown command '#{command}'"
63
- puts
64
- end
65
- code
66
- end
67
-
68
- def bash
69
- return '' if Options.quiet
70
- location = if @agent.uri
71
- uri = URI::parse(@agent.uri)
72
- path = uri.path.to_s
73
- path = path[0..0] + "..." + path[-16..-1] if path.size > 20
74
- if uri.query
75
- query = "?" + uri.query
76
- query = "?..." + query[-10..-1] if query.size > 13
77
- else
78
- query = ""
79
- end
80
- "#{uri.base}#{path}#{query}"
81
- else
82
- ''
83
- end
84
- "#{location}$ "
85
- end
86
- end
87
- end
@@ -1,18 +0,0 @@
1
- require 'gtk2'
2
- module Gtk
3
- module WebKit
4
- end
5
- end
6
-
7
- require 'rbwebkitgtk.so'
8
-
9
- class Gtk::WebKit::WebView
10
- alias :load_html_string_no_defaults :load_html_string
11
- def load_html_string(content, base_uri=nil)
12
- load_html_string_no_defaults(content, base_uri)
13
- end
14
-
15
- def mark_text_matches(test, case_sensitive=false, limit=0)
16
- mark_text_matches_with_limit(test, case_sensitive, limit)
17
- end
18
- end