scrappy 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +5 -0
- data/Manifest +1 -3
- data/README.rdoc +11 -50
- data/bin/scrappy +9 -19
- data/lib/scrappy.rb +2 -1
- data/lib/scrappy/agent/agent.rb +5 -14
- data/lib/scrappy/agent/blind_agent.rb +2 -5
- data/lib/scrappy/agent/extractor.rb +2 -2
- data/lib/scrappy/agent/formats.rb +10 -2
- data/lib/scrappy/selectors/root.rb +1 -1
- data/lib/scrappy/server/errors.rb +13 -0
- data/lib/scrappy/server/server.rb +3 -1
- data/public/stylesheets/application.css +1 -1
- data/scrappy.gemspec +6 -6
- data/views/kb.haml +1 -1
- metadata +30 -11
- data/lib/scrappy/agent/visual_agent.rb +0 -101
- data/lib/scrappy/shell.rb +0 -87
- data/lib/scrappy/webkit/webkit.rb +0 -18
data/History.txt
CHANGED
data/Manifest
CHANGED
@@ -12,7 +12,6 @@ lib/scrappy/agent/dumper.rb
|
|
12
12
|
lib/scrappy/agent/extractor.rb
|
13
13
|
lib/scrappy/agent/formats.rb
|
14
14
|
lib/scrappy/agent/map_reduce.rb
|
15
|
-
lib/scrappy/agent/visual_agent.rb
|
16
15
|
lib/scrappy/repository.rb
|
17
16
|
lib/scrappy/selectors/base_uri.rb
|
18
17
|
lib/scrappy/selectors/css.rb
|
@@ -24,11 +23,10 @@ lib/scrappy/selectors/uri.rb
|
|
24
23
|
lib/scrappy/selectors/uri_pattern.rb
|
25
24
|
lib/scrappy/selectors/xpath.rb
|
26
25
|
lib/scrappy/server/admin.rb
|
26
|
+
lib/scrappy/server/errors.rb
|
27
27
|
lib/scrappy/server/helpers.rb
|
28
28
|
lib/scrappy/server/server.rb
|
29
|
-
lib/scrappy/shell.rb
|
30
29
|
lib/scrappy/support.rb
|
31
|
-
lib/scrappy/webkit/webkit.rb
|
32
30
|
public/favicon.ico
|
33
31
|
public/images/logo.png
|
34
32
|
public/images/logo_tiny.png
|
data/README.rdoc
CHANGED
@@ -58,62 +58,23 @@ scrappy offers many different interfaces to get RDF data from a web page:
|
|
58
58
|
|
59
59
|
* Command-line interface:
|
60
60
|
|
61
|
-
$ scrappy -g
|
62
|
-
|
63
|
-
* Interactive shell:
|
64
|
-
|
65
|
-
$ scrappy -i
|
66
|
-
Launching scrappy Shell...
|
67
|
-
$ get elmundo.es
|
68
|
-
dc: http://purl.org/dc/elements/1.1/
|
69
|
-
owl: http://www.w3.org/2002/07/owl#
|
70
|
-
rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
|
71
|
-
sc: http://lab.gsi.dit.upm.es/scraping.rdf#
|
72
|
-
rdfs: http://www.w3.org/2000/01/rdf-schema#
|
73
|
-
http://www.elmundo.es/elmundo/2010/10/05/gentes/1286310993.html:
|
74
|
-
dc:description: "Las vacaciones del n\u00famero uno"
|
75
|
-
dc:title:
|
76
|
-
"Una suite de 5.000 euros para Nadal en Tailandia"
|
77
|
-
"Una suite de 5.000 euros para Nadal"
|
78
|
-
rdf:type: http://rdfs.org/sioc/ns#Post
|
79
|
-
dc:creator: "Fernando Domingo | John Bali (V\u00eddeo)"
|
80
|
-
http://www.daml.org/experiment/ontology/location-ont#location:
|
81
|
-
*:
|
82
|
-
rdf:label: "Bangkok"
|
83
|
-
rdf:type: http://www.daml.org/experiment/ontology/location-ont#Location
|
84
|
-
dc:date: "mi\u00e9rcoles 06/10/2010"
|
85
|
-
...
|
86
|
-
|
87
|
-
http://www.elmundo.es$
|
61
|
+
$ scrappy -g example.com
|
88
62
|
|
89
|
-
* Web
|
63
|
+
* Web Admin interface:
|
90
64
|
|
91
|
-
$ scrappy -
|
92
|
-
Launching
|
93
|
-
|
65
|
+
$ scrappy -a
|
66
|
+
Launching Scrappy Web Admin (browse http://localhost:3434)...
|
67
|
+
== Sinatra/1.1.3 has taken the stage on 3434 for production with backup from Thin
|
94
68
|
|
95
69
|
Then point your browser to http://localhost:3434 for additional directions.
|
96
70
|
|
97
|
-
* Web
|
98
|
-
|
99
|
-
$ scrappy -S
|
100
|
-
Launching scrappy Web Proxy...
|
101
|
-
** Starting Mongrel on localhost:3434
|
102
|
-
|
103
|
-
Then configure your browser's HTTP proxy to http://localhost:3434 and browse http://www.elmundo.es
|
104
|
-
|
105
|
-
* Scripting (experimental):
|
106
|
-
|
107
|
-
You can create scripts that retrieve many web pages and run them using scrappy.
|
108
|
-
|
109
|
-
#!/usr/bin/scrappy
|
110
|
-
get elmundo.es
|
111
|
-
get google.com/search?q=testing
|
71
|
+
* Web Service interface:
|
112
72
|
|
113
|
-
|
73
|
+
$ scrappy -s
|
74
|
+
Launching Scrappy Web Server...
|
75
|
+
== Sinatra/1.1.3 has taken the stage on 3434 for production with backup from Thin
|
114
76
|
|
115
|
-
|
116
|
-
with variables to enable flow control in order to build web service mashups.
|
77
|
+
Then use the service in the same way as the Web Admin but for read-only operations.
|
117
78
|
|
118
79
|
* Ruby interface:
|
119
80
|
|
@@ -129,7 +90,7 @@ scrappy offers many different interfaces to get RDF data from a web page:
|
|
129
90
|
Scrappy::Agent::Options.kb = kb
|
130
91
|
|
131
92
|
# Create an agent
|
132
|
-
agent = Scrappy::Agent.
|
93
|
+
agent = Scrappy::Agent.new
|
133
94
|
|
134
95
|
# Get RDF output
|
135
96
|
output = agent.request :method=>:get, :uri=>'http://www.elmundo.es'
|
data/bin/scrappy
CHANGED
@@ -38,17 +38,14 @@ module Scrappy
|
|
38
38
|
opts.on('-p URI', '--post URI') { |uri| Options.uri = uri; Options.http_method=:post }
|
39
39
|
opts.on('-D', '--dump') { Agent::Options.dump = true; Agent::Options.format = :rdf }
|
40
40
|
opts.on('-u', '--debug') { Agent::Options.debug = true }
|
41
|
-
opts.on('-i', '--interactive') { Options.shell = true; Agent::Options.format_header = false }
|
42
41
|
opts.on('-s', '--server [BASE_URI]') { |uri| Options.server = true; Options.base_uri = uri }
|
43
42
|
opts.on('-a', '--admin [BASE_URI]') { |uri| Options.admin = true; Options.base_uri = uri }
|
44
43
|
opts.on('-P P', '--port P') { |p| Options.port = p }
|
45
44
|
opts.on('-c C', '--concurrence C') { |c| Agent::Options.workers = c.to_i }
|
46
45
|
opts.on('-d D', '--delay D') { |d| Agent::Options.delay = d; Agent::Options.workers = 1 }
|
47
46
|
opts.on('-l L', '--levels L') { |l| Agent::Options.depth = l.to_i }
|
48
|
-
opts.on('-V', '--visual') { Agent::Options.agent = :visual }
|
49
47
|
opts.on('-r', '--reference') { Agent::Options.referenceable = :minimum }
|
50
48
|
opts.on('-R', '--reference-all') { Agent::Options.referenceable = :dump }
|
51
|
-
opts.on('-w', '--window') { Agent::Options.window = true }
|
52
49
|
opts.on('-f FORMAT', '--format FORMAT') { |f| Agent::Options.format = f.to_sym }
|
53
50
|
opts.on('-t TIME', '--time TIME') { |t| Agent::Options.time = t.to_i*60 } # converts minutes to seconds
|
54
51
|
opts.on('-o URIs', '--observe URIs') { |uris| Options.observe = uris.split(',') }
|
@@ -60,11 +57,11 @@ module Scrappy
|
|
60
57
|
onload
|
61
58
|
if Options.uri
|
62
59
|
Options.quiet = true
|
63
|
-
puts Agent.
|
60
|
+
puts Agent.new.proxy(:http_method=>:get, :uri=>Options.uri).output
|
64
61
|
elsif Options.observe
|
65
|
-
Agent.
|
62
|
+
Agent.new.observe(Options.observe)
|
66
63
|
elsif Options.admin
|
67
|
-
puts "Launching Scrappy
|
64
|
+
puts "Launching Scrappy Web Admin (browse http://localhost:#{Options.port})..."
|
68
65
|
require 'scrappy/server/server'
|
69
66
|
Thin::Logging.silent = true
|
70
67
|
Scrappy::Server.register Scrappy::Admin
|
@@ -76,14 +73,10 @@ module Scrappy
|
|
76
73
|
Thin::Logging.silent = true
|
77
74
|
Scrappy::Server.run! :host => 'localhost', :port => Options.port, :environment => :production,
|
78
75
|
:base_uri => Options.base_uri
|
79
|
-
elsif Options.shell
|
80
|
-
puts "Launching Scrappy Shell..."
|
81
|
-
require 'scrappy/shell'
|
82
|
-
Shell.new.run
|
83
76
|
else
|
84
|
-
|
85
|
-
|
86
|
-
|
77
|
+
output_version
|
78
|
+
puts 'To get help use: scrappy -h'
|
79
|
+
exit 0
|
87
80
|
end
|
88
81
|
Scrappy::App.quit
|
89
82
|
end
|
@@ -106,7 +99,7 @@ Usage
|
|
106
99
|
Options
|
107
100
|
-h, --help Displays help message
|
108
101
|
-v, --version Display the version, then exit
|
109
|
-
-f, --format Picks output format (json, ejson,
|
102
|
+
-f, --format Picks output format (json, ejson, rdf, ntriples, png)
|
110
103
|
-g, --get URL Gets requested URL
|
111
104
|
-p, --post URL Posts requested URL
|
112
105
|
-c, --concurrence VALUE Sets number of concurrent connections for crawling (default is 10)
|
@@ -114,16 +107,13 @@ Options
|
|
114
107
|
-d, --delay VALUE Sets delay (in ms) between requests (default is 0)
|
115
108
|
-D, --dump Dumps RDF data to disk
|
116
109
|
-u, --debug Shows debugging traces
|
117
|
-
-i, --interactive Runs interactive shell
|
118
110
|
-o, --observe URLs Observes the specified URLs storing their data into the repository
|
119
111
|
-s, --server [ROOT] Runs web server (optionally specify server's root url)
|
120
|
-
-
|
112
|
+
-a, --admin [ROOT] Runs admin web server (optionally specify server's root url)
|
121
113
|
-P, --port PORT Selects port number (default is 3434)
|
122
|
-
-
|
123
|
-
-t, --time DAYS Returns repository data from the last given minutes
|
114
|
+
-t, --time TIME Returns repository data from the last given minutes
|
124
115
|
-r, --reference Outputs referenceable data
|
125
116
|
-R, --reference-all Outputs all HTML referenceable data
|
126
|
-
-w, --window Shows browser window (requires -v)
|
127
117
|
|
128
118
|
Authors
|
129
119
|
José Ignacio Fernández, Alberto Mardomingo, Jacobo Blasco
|
data/lib/scrappy.rb
CHANGED
@@ -17,12 +17,13 @@ require 'scrappy/agent/map_reduce'
|
|
17
17
|
require 'scrappy/agent/cache'
|
18
18
|
require 'scrappy/agent/dumper'
|
19
19
|
require 'scrappy/agent/formats'
|
20
|
+
require 'scrappy/agent/blind_agent'
|
20
21
|
require 'scrappy/agent/agent'
|
21
22
|
|
22
23
|
Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
|
23
24
|
|
24
25
|
module Scrappy
|
25
|
-
VERSION = '0.
|
26
|
+
VERSION = '0.3.0'
|
26
27
|
end
|
27
28
|
|
28
29
|
# Require selectors
|
data/lib/scrappy/agent/agent.rb
CHANGED
@@ -4,8 +4,9 @@ module Scrappy
|
|
4
4
|
include Extractor
|
5
5
|
include MapReduce
|
6
6
|
include Cached
|
7
|
+
include BlindAgent
|
7
8
|
|
8
|
-
Options = OpenStruct.new :format=>:yarf, :format_header=>true, :depth=>0, :
|
9
|
+
Options = OpenStruct.new :format=>:yarf, :format_header=>true, :depth=>0, :delay=>0, :workers=>10
|
9
10
|
ContentTypes = { :png => 'image/png', :rdfxml => 'application/rdf+xml',
|
10
11
|
:rdf => 'application/rdf+xml' }
|
11
12
|
|
@@ -13,17 +14,7 @@ module Scrappy
|
|
13
14
|
@pool ||= {}
|
14
15
|
end
|
15
16
|
def self.[] id
|
16
|
-
pool[id] || Agent.
|
17
|
-
end
|
18
|
-
|
19
|
-
def self.create args={}
|
20
|
-
if (args[:agent] || Options.agent) == :visual
|
21
|
-
require 'scrappy/agent/visual_agent'
|
22
|
-
VisualAgent.new args
|
23
|
-
else
|
24
|
-
require 'scrappy/agent/blind_agent'
|
25
|
-
BlindAgent.new args
|
26
|
-
end
|
17
|
+
pool[id] || Agent.new(:id=>id)
|
27
18
|
end
|
28
19
|
|
29
20
|
attr_accessor :id, :options, :kb
|
@@ -160,7 +151,8 @@ module Scrappy
|
|
160
151
|
end
|
161
152
|
|
162
153
|
def clean triples
|
163
|
-
triples.uniq.select { |s,p,o| p!=
|
154
|
+
triples.uniq.select { |s,p,o| p!=ID('rdf:type') or ![ID('sc:Index'), ID('sc:Page')].include?(o) }.
|
155
|
+
select { |s,p,o| p!=Node('rdf:type') or ![Node('sc:Index'), Node('sc:Page')].include?(o) }
|
164
156
|
end
|
165
157
|
|
166
158
|
# Do the extraction using RDF repository
|
@@ -231,7 +223,6 @@ module Scrappy
|
|
231
223
|
puts 'done!' if options.debug
|
232
224
|
|
233
225
|
if self.html_data?
|
234
|
-
add_visual_data! if options.referenceable # Adds tags including visual information
|
235
226
|
triples = extract(self.uri, html, options.referenceable) # Extract data
|
236
227
|
Dumper.dump self.uri, clean(triples), options.format if options.dump # Dump results to disk
|
237
228
|
triples
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module Scrappy
|
2
|
-
|
2
|
+
module BlindAgent
|
3
3
|
def initialize args={}
|
4
|
-
super
|
4
|
+
super()
|
5
5
|
@mechanize = Mechanize.new
|
6
6
|
@mechanize.max_history = 20
|
7
7
|
end
|
@@ -36,8 +36,5 @@ module Scrappy
|
|
36
36
|
def html
|
37
37
|
@mechanize.current_page.root.to_html :encoding=>'UTF-8'
|
38
38
|
end
|
39
|
-
|
40
|
-
def add_visual_data!
|
41
|
-
end
|
42
39
|
end
|
43
40
|
end
|
@@ -96,7 +96,7 @@ module Scrappy
|
|
96
96
|
end
|
97
97
|
|
98
98
|
def filter selector, doc
|
99
|
-
if
|
99
|
+
if selector.sc::debug.first=="true" and options.debug
|
100
100
|
puts '== DEBUG'
|
101
101
|
puts '== Selector:'
|
102
102
|
puts selector.serialize(:yarf, false)
|
@@ -109,7 +109,7 @@ module Scrappy
|
|
109
109
|
# Process selector
|
110
110
|
results = selector_pool(selector).filter doc
|
111
111
|
|
112
|
-
if
|
112
|
+
if selector.sc::debug.first=="true" and options.debug
|
113
113
|
puts "== No results" if results.empty?
|
114
114
|
results.each_with_index do |result, i|
|
115
115
|
puts "== Result ##{i}:"
|
@@ -25,9 +25,17 @@ module Scrappy
|
|
25
25
|
doc.search("p").each {|n| n.replace(Nokogiri::XML::Text.new("#{n.text.strip}\n", n.document)) }
|
26
26
|
doc.text.strip
|
27
27
|
when Node('sc:Html') then
|
28
|
-
node.to_html
|
28
|
+
if node.respond_to? :to_html
|
29
|
+
node.to_html
|
30
|
+
else
|
31
|
+
node.to_s
|
32
|
+
end
|
29
33
|
else
|
30
|
-
node.text
|
34
|
+
if node.respond_to? :text
|
35
|
+
node.text
|
36
|
+
else
|
37
|
+
node.to_s
|
38
|
+
end
|
31
39
|
end
|
32
40
|
end
|
33
41
|
|
@@ -8,7 +8,7 @@ module Sc
|
|
8
8
|
# Select node's attribute if given
|
9
9
|
sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content][attribute] } }
|
10
10
|
else
|
11
|
-
[ { :uri=>doc[:uri], :content=>doc[:content], :value=>format(doc[:
|
11
|
+
[ { :uri=>doc[:uri], :content=>doc[:content], :value=>format(doc[:value], sc::format, doc[:uri]) } ]
|
12
12
|
end
|
13
13
|
end
|
14
14
|
end
|
@@ -3,10 +3,12 @@ require 'thin'
|
|
3
3
|
require 'haml'
|
4
4
|
require 'scrappy/server/helpers'
|
5
5
|
require 'scrappy/server/admin'
|
6
|
+
require 'scrappy/server/errors'
|
6
7
|
|
7
8
|
module Scrappy
|
8
9
|
class Server < Sinatra::Base
|
9
10
|
helpers JavaScriptHelpers
|
11
|
+
register Errors
|
10
12
|
|
11
13
|
enable :sessions
|
12
14
|
set :root, File.join(File.dirname(__FILE__), '..', '..', '..')
|
@@ -40,7 +42,7 @@ module Scrappy
|
|
40
42
|
return @agent if @agent
|
41
43
|
if session[:agent].nil? || session[:token] != SESSION_TOKEN
|
42
44
|
session[:token] = SESSION_TOKEN
|
43
|
-
session[:agent] = Scrappy::Agent.
|
45
|
+
session[:agent] = Scrappy::Agent.new.id
|
44
46
|
end
|
45
47
|
@agent = Scrappy::Agent[session[:agent]]
|
46
48
|
end
|
data/scrappy.gemspec
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.
|
5
|
+
s.version = "0.3.0"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
@@ -11,22 +11,22 @@ Gem::Specification.new do |s|
|
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
13
13
|
s.executables = ["scrappy"]
|
14
|
-
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/
|
15
|
-
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/
|
14
|
+
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/repository.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb"]
|
15
|
+
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/repository.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb", "public/favicon.ico", "public/images/logo.png", "public/images/logo_tiny.png", "public/javascripts/scrappy.js", "public/stylesheets/application.css", "test/test_helper.rb", "test/test_scrappy.rb", "views/help.haml", "views/home.haml", "views/kb.haml", "views/layout.haml", "scrappy.gemspec"]
|
16
16
|
s.homepage = %q{http://github.com/josei/scrappy}
|
17
17
|
s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
|
18
18
|
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
|
19
19
|
s.require_paths = ["lib"]
|
20
20
|
s.rubyforge_project = %q{scrappy}
|
21
|
-
s.rubygems_version = %q{1.3.
|
21
|
+
s.rubygems_version = %q{1.3.7}
|
22
22
|
s.summary = %q{Web scraper that allows producing RDF data out of plain web pages}
|
23
|
-
s.test_files = ["test/
|
23
|
+
s.test_files = ["test/test_helper.rb", "test/test_scrappy.rb"]
|
24
24
|
|
25
25
|
if s.respond_to? :specification_version then
|
26
26
|
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
27
27
|
s.specification_version = 3
|
28
28
|
|
29
|
-
if Gem::Version.new(Gem::
|
29
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
30
30
|
s.add_runtime_dependency(%q<activesupport>, [">= 2.3.5"])
|
31
31
|
s.add_runtime_dependency(%q<sinatra>, [">= 1.1.2"])
|
32
32
|
s.add_runtime_dependency(%q<thin>, [">= 1.2.7"])
|
data/views/kb.haml
CHANGED
@@ -10,6 +10,6 @@
|
|
10
10
|
-else
|
11
11
|
=uri
|
12
12
|
-if !uri.include?('*')
|
13
|
-
-[['RDF', :rdf], ['JSON', :ejson], ['YARF', :yarf], ['PNG', :png]].reverse.each do |format, format_code|
|
13
|
+
-[['RDF', :rdf], ['JSON', :ejson], ['YARF', :yarf], ['nTriples', :ntriples], ['PNG', :png]].reverse.each do |format, format_code|
|
14
14
|
%span.format
|
15
15
|
%a{:href=>"#{settings.base_uri}/#{format_code}/#{uri}"}=format
|
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrappy
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
+
hash: 19
|
4
5
|
prerelease: false
|
5
6
|
segments:
|
6
7
|
- 0
|
7
|
-
-
|
8
|
-
-
|
9
|
-
version: 0.
|
8
|
+
- 3
|
9
|
+
- 0
|
10
|
+
version: 0.3.0
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- Jose Ignacio
|
@@ -21,9 +22,11 @@ dependencies:
|
|
21
22
|
name: activesupport
|
22
23
|
prerelease: false
|
23
24
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
24
26
|
requirements:
|
25
27
|
- - ">="
|
26
28
|
- !ruby/object:Gem::Version
|
29
|
+
hash: 9
|
27
30
|
segments:
|
28
31
|
- 2
|
29
32
|
- 3
|
@@ -35,9 +38,11 @@ dependencies:
|
|
35
38
|
name: sinatra
|
36
39
|
prerelease: false
|
37
40
|
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
38
42
|
requirements:
|
39
43
|
- - ">="
|
40
44
|
- !ruby/object:Gem::Version
|
45
|
+
hash: 23
|
41
46
|
segments:
|
42
47
|
- 1
|
43
48
|
- 1
|
@@ -49,9 +54,11 @@ dependencies:
|
|
49
54
|
name: thin
|
50
55
|
prerelease: false
|
51
56
|
requirement: &id003 !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
52
58
|
requirements:
|
53
59
|
- - ">="
|
54
60
|
- !ruby/object:Gem::Version
|
61
|
+
hash: 17
|
55
62
|
segments:
|
56
63
|
- 1
|
57
64
|
- 2
|
@@ -63,9 +70,11 @@ dependencies:
|
|
63
70
|
name: nokogiri
|
64
71
|
prerelease: false
|
65
72
|
requirement: &id004 !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
66
74
|
requirements:
|
67
75
|
- - ">="
|
68
76
|
- !ruby/object:Gem::Version
|
77
|
+
hash: 5
|
69
78
|
segments:
|
70
79
|
- 1
|
71
80
|
- 4
|
@@ -77,9 +86,11 @@ dependencies:
|
|
77
86
|
name: mechanize
|
78
87
|
prerelease: false
|
79
88
|
requirement: &id005 !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
80
90
|
requirements:
|
81
91
|
- - ">="
|
82
92
|
- !ruby/object:Gem::Version
|
93
|
+
hash: 23
|
83
94
|
segments:
|
84
95
|
- 1
|
85
96
|
- 0
|
@@ -91,9 +102,11 @@ dependencies:
|
|
91
102
|
name: lightrdf
|
92
103
|
prerelease: false
|
93
104
|
requirement: &id006 !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
94
106
|
requirements:
|
95
107
|
- - ">="
|
96
108
|
- !ruby/object:Gem::Version
|
109
|
+
hash: 21
|
97
110
|
segments:
|
98
111
|
- 0
|
99
112
|
- 2
|
@@ -105,9 +118,11 @@ dependencies:
|
|
105
118
|
name: i18n
|
106
119
|
prerelease: false
|
107
120
|
requirement: &id007 !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
108
122
|
requirements:
|
109
123
|
- - ">="
|
110
124
|
- !ruby/object:Gem::Version
|
125
|
+
hash: 11
|
111
126
|
segments:
|
112
127
|
- 0
|
113
128
|
- 4
|
@@ -119,9 +134,11 @@ dependencies:
|
|
119
134
|
name: rest-client
|
120
135
|
prerelease: false
|
121
136
|
requirement: &id008 !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
122
138
|
requirements:
|
123
139
|
- - ">="
|
124
140
|
- !ruby/object:Gem::Version
|
141
|
+
hash: 13
|
125
142
|
segments:
|
126
143
|
- 1
|
127
144
|
- 6
|
@@ -133,9 +150,11 @@ dependencies:
|
|
133
150
|
name: haml
|
134
151
|
prerelease: false
|
135
152
|
requirement: &id009 !ruby/object:Gem::Requirement
|
153
|
+
none: false
|
136
154
|
requirements:
|
137
155
|
- - ">="
|
138
156
|
- !ruby/object:Gem::Version
|
157
|
+
hash: 55
|
139
158
|
segments:
|
140
159
|
- 3
|
141
160
|
- 0
|
@@ -160,7 +179,6 @@ extra_rdoc_files:
|
|
160
179
|
- lib/scrappy/agent/extractor.rb
|
161
180
|
- lib/scrappy/agent/formats.rb
|
162
181
|
- lib/scrappy/agent/map_reduce.rb
|
163
|
-
- lib/scrappy/agent/visual_agent.rb
|
164
182
|
- lib/scrappy/repository.rb
|
165
183
|
- lib/scrappy/selectors/base_uri.rb
|
166
184
|
- lib/scrappy/selectors/css.rb
|
@@ -172,11 +190,10 @@ extra_rdoc_files:
|
|
172
190
|
- lib/scrappy/selectors/uri_pattern.rb
|
173
191
|
- lib/scrappy/selectors/xpath.rb
|
174
192
|
- lib/scrappy/server/admin.rb
|
193
|
+
- lib/scrappy/server/errors.rb
|
175
194
|
- lib/scrappy/server/helpers.rb
|
176
195
|
- lib/scrappy/server/server.rb
|
177
|
-
- lib/scrappy/shell.rb
|
178
196
|
- lib/scrappy/support.rb
|
179
|
-
- lib/scrappy/webkit/webkit.rb
|
180
197
|
files:
|
181
198
|
- History.txt
|
182
199
|
- Manifest
|
@@ -192,7 +209,6 @@ files:
|
|
192
209
|
- lib/scrappy/agent/extractor.rb
|
193
210
|
- lib/scrappy/agent/formats.rb
|
194
211
|
- lib/scrappy/agent/map_reduce.rb
|
195
|
-
- lib/scrappy/agent/visual_agent.rb
|
196
212
|
- lib/scrappy/repository.rb
|
197
213
|
- lib/scrappy/selectors/base_uri.rb
|
198
214
|
- lib/scrappy/selectors/css.rb
|
@@ -204,11 +220,10 @@ files:
|
|
204
220
|
- lib/scrappy/selectors/uri_pattern.rb
|
205
221
|
- lib/scrappy/selectors/xpath.rb
|
206
222
|
- lib/scrappy/server/admin.rb
|
223
|
+
- lib/scrappy/server/errors.rb
|
207
224
|
- lib/scrappy/server/helpers.rb
|
208
225
|
- lib/scrappy/server/server.rb
|
209
|
-
- lib/scrappy/shell.rb
|
210
226
|
- lib/scrappy/support.rb
|
211
|
-
- lib/scrappy/webkit/webkit.rb
|
212
227
|
- public/favicon.ico
|
213
228
|
- public/images/logo.png
|
214
229
|
- public/images/logo_tiny.png
|
@@ -236,16 +251,20 @@ rdoc_options:
|
|
236
251
|
require_paths:
|
237
252
|
- lib
|
238
253
|
required_ruby_version: !ruby/object:Gem::Requirement
|
254
|
+
none: false
|
239
255
|
requirements:
|
240
256
|
- - ">="
|
241
257
|
- !ruby/object:Gem::Version
|
258
|
+
hash: 3
|
242
259
|
segments:
|
243
260
|
- 0
|
244
261
|
version: "0"
|
245
262
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
263
|
+
none: false
|
246
264
|
requirements:
|
247
265
|
- - ">="
|
248
266
|
- !ruby/object:Gem::Version
|
267
|
+
hash: 11
|
249
268
|
segments:
|
250
269
|
- 1
|
251
270
|
- 2
|
@@ -253,10 +272,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
253
272
|
requirements: []
|
254
273
|
|
255
274
|
rubyforge_project: scrappy
|
256
|
-
rubygems_version: 1.3.
|
275
|
+
rubygems_version: 1.3.7
|
257
276
|
signing_key:
|
258
277
|
specification_version: 3
|
259
278
|
summary: Web scraper that allows producing RDF data out of plain web pages
|
260
279
|
test_files:
|
261
|
-
- test/test_scrappy.rb
|
262
280
|
- test/test_helper.rb
|
281
|
+
- test/test_scrappy.rb
|
@@ -1,101 +0,0 @@
|
|
1
|
-
# Hack to hide annoying gtk debug messages
|
2
|
-
old_stderr = $stderr.clone
|
3
|
-
$stderr.reopen '/dev/null'
|
4
|
-
require 'scrappy/webkit/webkit'
|
5
|
-
$stderr = old_stderr
|
6
|
-
|
7
|
-
module Scrappy
|
8
|
-
class VisualAgent < Agent
|
9
|
-
attr_reader :visible
|
10
|
-
|
11
|
-
def initialize args={}
|
12
|
-
super
|
13
|
-
|
14
|
-
@cv = new_cond
|
15
|
-
|
16
|
-
@webview = Gtk::WebKit::WebView.new
|
17
|
-
@webview.signal_connect("load_finished") { synchronize { @cv.signal } }
|
18
|
-
|
19
|
-
@window = Gtk::Window.new
|
20
|
-
@window.signal_connect("destroy") { Gtk.main_quit }
|
21
|
-
@window.add(@webview)
|
22
|
-
@window.set_size_request(1024, 600)
|
23
|
-
if args[:window] or (args[:window].nil? and Agent::Options.window)
|
24
|
-
@window.show_all
|
25
|
-
@visible = true
|
26
|
-
end
|
27
|
-
@mechanize = Mechanize.new
|
28
|
-
end
|
29
|
-
|
30
|
-
def uri
|
31
|
-
@uri
|
32
|
-
end
|
33
|
-
|
34
|
-
def uri= uri
|
35
|
-
# First, check if the requested uri is a valid HTML page
|
36
|
-
valid = begin
|
37
|
-
@mechanize.get(uri).is_a?(Mechanize::Page)
|
38
|
-
rescue
|
39
|
-
false
|
40
|
-
end
|
41
|
-
|
42
|
-
# Open the page in the browser if it's an HTML page
|
43
|
-
if valid
|
44
|
-
synchronize do
|
45
|
-
@webview.open uri.to_s
|
46
|
-
@cv.wait(60) # 1 minute to open the page
|
47
|
-
@uri = @webview.uri
|
48
|
-
end
|
49
|
-
else
|
50
|
-
@uri = nil
|
51
|
-
end
|
52
|
-
end
|
53
|
-
|
54
|
-
def html_data?
|
55
|
-
uri.to_s != ""
|
56
|
-
end
|
57
|
-
|
58
|
-
def html
|
59
|
-
js "document.documentElement.outerHTML"
|
60
|
-
end
|
61
|
-
|
62
|
-
def add_visual_data!
|
63
|
-
js """var items = document.documentElement.getElementsByTagName('*');
|
64
|
-
var i=0;
|
65
|
-
for(var i=0; i<items.length; i++) {
|
66
|
-
var item = items[i];
|
67
|
-
item.setAttribute('vx', item.offsetLeft);
|
68
|
-
item.setAttribute('vy', item.offsetTop);
|
69
|
-
item.setAttribute('vw', item.offsetWidth);
|
70
|
-
item.setAttribute('vh', item.offsetHeight);
|
71
|
-
item.setAttribute('vsize', document.defaultView.getComputedStyle(item, null).getPropertyValue('font-size'));
|
72
|
-
var weight = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-weight');
|
73
|
-
if (weight == 'normal') weight = 400;
|
74
|
-
if (weight == 'bold') weight = 700;
|
75
|
-
item.setAttribute('vweight', weight);
|
76
|
-
item.setAttribute('vcolor', document.defaultView.getComputedStyle(item, null).getPropertyValue('color'));
|
77
|
-
item.setAttribute('vbcolor', document.defaultView.getComputedStyle(item, null).getPropertyValue('background-color'));
|
78
|
-
}"""
|
79
|
-
end
|
80
|
-
|
81
|
-
def js code
|
82
|
-
old_title = @webview.title
|
83
|
-
@webview.execute_script("document.title = JSON.stringify(eval(#{ActiveSupport::JSON.encode(code)}))")
|
84
|
-
title = ActiveSupport::JSON.decode(@webview.title)
|
85
|
-
@webview.execute_script("document.title = #{ActiveSupport::JSON.encode(old_title)}")
|
86
|
-
title
|
87
|
-
end
|
88
|
-
|
89
|
-
def load_js url
|
90
|
-
function = """function include(destination) {
|
91
|
-
var e=window.document.createElement('script');
|
92
|
-
e.setAttribute('src',destination);
|
93
|
-
window.document.body.appendChild(e);
|
94
|
-
}"""
|
95
|
-
js function
|
96
|
-
js "include('#{url}')"
|
97
|
-
end
|
98
|
-
end
|
99
|
-
end
|
100
|
-
|
101
|
-
Thread.new { Gtk.main }
|
data/lib/scrappy/shell.rb
DELETED
@@ -1,87 +0,0 @@
|
|
1
|
-
require 'readline'
|
2
|
-
|
3
|
-
module Scrappy
|
4
|
-
class Shell
|
5
|
-
def initialize file=nil
|
6
|
-
@agent = Agent.create
|
7
|
-
@file = file
|
8
|
-
end
|
9
|
-
|
10
|
-
def run
|
11
|
-
commands = ['get', 'quit', 'help', 'annotate', 'html']
|
12
|
-
|
13
|
-
Readline.completion_append_character = " "
|
14
|
-
Readline.completer_word_break_characters = ""
|
15
|
-
Readline.completion_proc = proc { |line| commands.grep(/^#{Regexp.escape(line)}/).sort }
|
16
|
-
|
17
|
-
if @file
|
18
|
-
open(@file, 'r').lines.each do |line|
|
19
|
-
break if process(line) == :quit
|
20
|
-
end
|
21
|
-
else
|
22
|
-
begin
|
23
|
-
line = Readline.readline(bash, true)
|
24
|
-
code = process line.nil? ? (puts 'quit' unless Options.quiet; 'quit') : line
|
25
|
-
end while code != :quit
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
|
-
protected
|
30
|
-
def process raw_command
|
31
|
-
command = raw_command.strip
|
32
|
-
|
33
|
-
code = if command =~ /\Aget\W(.*)\Z/
|
34
|
-
puts @agent.proxy(:uri=>$1).output
|
35
|
-
puts
|
36
|
-
elsif command == 'help'
|
37
|
-
puts 'Available commands:'
|
38
|
-
puts ' get URL: Visit the specified URL'
|
39
|
-
puts ' html: Show HTML code of the current URL'
|
40
|
-
puts ' annotate: Start the annotation tool that helps building extractors'
|
41
|
-
puts ' help: Show this information'
|
42
|
-
puts ' quit: Exit scrappy shell'
|
43
|
-
puts
|
44
|
-
elsif command == 'annotate'
|
45
|
-
if @agent.class.to_s == 'Scrappy::VisualAgent' and @agent.visible
|
46
|
-
@agent.load_js "http://ajax.googleapis.com/ajax/libs/jquery/1/jquery.min.js"
|
47
|
-
@agent.load_js "http://github.com/josei/scrappy/raw/master/lib/js/annotator.js"
|
48
|
-
puts "Use the browser's window to annotate resources"
|
49
|
-
puts
|
50
|
-
else
|
51
|
-
puts 'ERROR: Scrappy must be run with -v and -w options to use this feature'
|
52
|
-
puts
|
53
|
-
end
|
54
|
-
elsif command == 'html'
|
55
|
-
puts @agent.html
|
56
|
-
puts
|
57
|
-
elsif command == 'quit'
|
58
|
-
:quit
|
59
|
-
elsif command == '' or command[0..0] == '#'
|
60
|
-
nil
|
61
|
-
else
|
62
|
-
puts "ERROR: Unknown command '#{command}'"
|
63
|
-
puts
|
64
|
-
end
|
65
|
-
code
|
66
|
-
end
|
67
|
-
|
68
|
-
def bash
|
69
|
-
return '' if Options.quiet
|
70
|
-
location = if @agent.uri
|
71
|
-
uri = URI::parse(@agent.uri)
|
72
|
-
path = uri.path.to_s
|
73
|
-
path = path[0..0] + "..." + path[-16..-1] if path.size > 20
|
74
|
-
if uri.query
|
75
|
-
query = "?" + uri.query
|
76
|
-
query = "?..." + query[-10..-1] if query.size > 13
|
77
|
-
else
|
78
|
-
query = ""
|
79
|
-
end
|
80
|
-
"#{uri.base}#{path}#{query}"
|
81
|
-
else
|
82
|
-
''
|
83
|
-
end
|
84
|
-
"#{location}$ "
|
85
|
-
end
|
86
|
-
end
|
87
|
-
end
|
@@ -1,18 +0,0 @@
|
|
1
|
-
require 'gtk2'
|
2
|
-
module Gtk
|
3
|
-
module WebKit
|
4
|
-
end
|
5
|
-
end
|
6
|
-
|
7
|
-
require 'rbwebkitgtk.so'
|
8
|
-
|
9
|
-
class Gtk::WebKit::WebView
|
10
|
-
alias :load_html_string_no_defaults :load_html_string
|
11
|
-
def load_html_string(content, base_uri=nil)
|
12
|
-
load_html_string_no_defaults(content, base_uri)
|
13
|
-
end
|
14
|
-
|
15
|
-
def mark_text_matches(test, case_sensitive=false, limit=0)
|
16
|
-
mark_text_matches_with_limit(test, case_sensitive, limit)
|
17
|
-
end
|
18
|
-
end
|