scrappy 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +5 -0
- data/Manifest +1 -3
- data/README.rdoc +11 -50
- data/bin/scrappy +9 -19
- data/lib/scrappy.rb +2 -1
- data/lib/scrappy/agent/agent.rb +5 -14
- data/lib/scrappy/agent/blind_agent.rb +2 -5
- data/lib/scrappy/agent/extractor.rb +2 -2
- data/lib/scrappy/agent/formats.rb +10 -2
- data/lib/scrappy/selectors/root.rb +1 -1
- data/lib/scrappy/server/errors.rb +13 -0
- data/lib/scrappy/server/server.rb +3 -1
- data/public/stylesheets/application.css +1 -1
- data/scrappy.gemspec +6 -6
- data/views/kb.haml +1 -1
- metadata +30 -11
- data/lib/scrappy/agent/visual_agent.rb +0 -101
- data/lib/scrappy/shell.rb +0 -87
- data/lib/scrappy/webkit/webkit.rb +0 -18
data/History.txt
CHANGED
data/Manifest
CHANGED
@@ -12,7 +12,6 @@ lib/scrappy/agent/dumper.rb
|
|
12
12
|
lib/scrappy/agent/extractor.rb
|
13
13
|
lib/scrappy/agent/formats.rb
|
14
14
|
lib/scrappy/agent/map_reduce.rb
|
15
|
-
lib/scrappy/agent/visual_agent.rb
|
16
15
|
lib/scrappy/repository.rb
|
17
16
|
lib/scrappy/selectors/base_uri.rb
|
18
17
|
lib/scrappy/selectors/css.rb
|
@@ -24,11 +23,10 @@ lib/scrappy/selectors/uri.rb
|
|
24
23
|
lib/scrappy/selectors/uri_pattern.rb
|
25
24
|
lib/scrappy/selectors/xpath.rb
|
26
25
|
lib/scrappy/server/admin.rb
|
26
|
+
lib/scrappy/server/errors.rb
|
27
27
|
lib/scrappy/server/helpers.rb
|
28
28
|
lib/scrappy/server/server.rb
|
29
|
-
lib/scrappy/shell.rb
|
30
29
|
lib/scrappy/support.rb
|
31
|
-
lib/scrappy/webkit/webkit.rb
|
32
30
|
public/favicon.ico
|
33
31
|
public/images/logo.png
|
34
32
|
public/images/logo_tiny.png
|
data/README.rdoc
CHANGED
@@ -58,62 +58,23 @@ scrappy offers many different interfaces to get RDF data from a web page:
|
|
58
58
|
|
59
59
|
* Command-line interface:
|
60
60
|
|
61
|
-
$ scrappy -g
|
62
|
-
|
63
|
-
* Interactive shell:
|
64
|
-
|
65
|
-
$ scrappy -i
|
66
|
-
Launching scrappy Shell...
|
67
|
-
$ get elmundo.es
|
68
|
-
dc: http://purl.org/dc/elements/1.1/
|
69
|
-
owl: http://www.w3.org/2002/07/owl#
|
70
|
-
rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
|
71
|
-
sc: http://lab.gsi.dit.upm.es/scraping.rdf#
|
72
|
-
rdfs: http://www.w3.org/2000/01/rdf-schema#
|
73
|
-
http://www.elmundo.es/elmundo/2010/10/05/gentes/1286310993.html:
|
74
|
-
dc:description: "Las vacaciones del n\u00famero uno"
|
75
|
-
dc:title:
|
76
|
-
"Una suite de 5.000 euros para Nadal en Tailandia"
|
77
|
-
"Una suite de 5.000 euros para Nadal"
|
78
|
-
rdf:type: http://rdfs.org/sioc/ns#Post
|
79
|
-
dc:creator: "Fernando Domingo | John Bali (V\u00eddeo)"
|
80
|
-
http://www.daml.org/experiment/ontology/location-ont#location:
|
81
|
-
*:
|
82
|
-
rdf:label: "Bangkok"
|
83
|
-
rdf:type: http://www.daml.org/experiment/ontology/location-ont#Location
|
84
|
-
dc:date: "mi\u00e9rcoles 06/10/2010"
|
85
|
-
...
|
86
|
-
|
87
|
-
http://www.elmundo.es$
|
61
|
+
$ scrappy -g example.com
|
88
62
|
|
89
|
-
* Web
|
63
|
+
* Web Admin interface:
|
90
64
|
|
91
|
-
$ scrappy -
|
92
|
-
Launching
|
93
|
-
|
65
|
+
$ scrappy -a
|
66
|
+
Launching Scrappy Web Admin (browse http://localhost:3434)...
|
67
|
+
== Sinatra/1.1.3 has taken the stage on 3434 for production with backup from Thin
|
94
68
|
|
95
69
|
Then point your browser to http://localhost:3434 for additional directions.
|
96
70
|
|
97
|
-
* Web
|
98
|
-
|
99
|
-
$ scrappy -S
|
100
|
-
Launching scrappy Web Proxy...
|
101
|
-
** Starting Mongrel on localhost:3434
|
102
|
-
|
103
|
-
Then configure your browser's HTTP proxy to http://localhost:3434 and browse http://www.elmundo.es
|
104
|
-
|
105
|
-
* Scripting (experimental):
|
106
|
-
|
107
|
-
You can create scripts that retrieve many web pages and run them using scrappy.
|
108
|
-
|
109
|
-
#!/usr/bin/scrappy
|
110
|
-
get elmundo.es
|
111
|
-
get google.com/search?q=testing
|
71
|
+
* Web Service interface:
|
112
72
|
|
113
|
-
|
73
|
+
$ scrappy -s
|
74
|
+
Launching Scrappy Web Server...
|
75
|
+
== Sinatra/1.1.3 has taken the stage on 3434 for production with backup from Thin
|
114
76
|
|
115
|
-
|
116
|
-
with variables to enable flow control in order to build web service mashups.
|
77
|
+
Then use the service in the same way as the Web Admin but for read-only operations.
|
117
78
|
|
118
79
|
* Ruby interface:
|
119
80
|
|
@@ -129,7 +90,7 @@ scrappy offers many different interfaces to get RDF data from a web page:
|
|
129
90
|
Scrappy::Agent::Options.kb = kb
|
130
91
|
|
131
92
|
# Create an agent
|
132
|
-
agent = Scrappy::Agent.
|
93
|
+
agent = Scrappy::Agent.new
|
133
94
|
|
134
95
|
# Get RDF output
|
135
96
|
output = agent.request :method=>:get, :uri=>'http://www.elmundo.es'
|
data/bin/scrappy
CHANGED
@@ -38,17 +38,14 @@ module Scrappy
|
|
38
38
|
opts.on('-p URI', '--post URI') { |uri| Options.uri = uri; Options.http_method=:post }
|
39
39
|
opts.on('-D', '--dump') { Agent::Options.dump = true; Agent::Options.format = :rdf }
|
40
40
|
opts.on('-u', '--debug') { Agent::Options.debug = true }
|
41
|
-
opts.on('-i', '--interactive') { Options.shell = true; Agent::Options.format_header = false }
|
42
41
|
opts.on('-s', '--server [BASE_URI]') { |uri| Options.server = true; Options.base_uri = uri }
|
43
42
|
opts.on('-a', '--admin [BASE_URI]') { |uri| Options.admin = true; Options.base_uri = uri }
|
44
43
|
opts.on('-P P', '--port P') { |p| Options.port = p }
|
45
44
|
opts.on('-c C', '--concurrence C') { |c| Agent::Options.workers = c.to_i }
|
46
45
|
opts.on('-d D', '--delay D') { |d| Agent::Options.delay = d; Agent::Options.workers = 1 }
|
47
46
|
opts.on('-l L', '--levels L') { |l| Agent::Options.depth = l.to_i }
|
48
|
-
opts.on('-V', '--visual') { Agent::Options.agent = :visual }
|
49
47
|
opts.on('-r', '--reference') { Agent::Options.referenceable = :minimum }
|
50
48
|
opts.on('-R', '--reference-all') { Agent::Options.referenceable = :dump }
|
51
|
-
opts.on('-w', '--window') { Agent::Options.window = true }
|
52
49
|
opts.on('-f FORMAT', '--format FORMAT') { |f| Agent::Options.format = f.to_sym }
|
53
50
|
opts.on('-t TIME', '--time TIME') { |t| Agent::Options.time = t.to_i*60 } # converts minutes to seconds
|
54
51
|
opts.on('-o URIs', '--observe URIs') { |uris| Options.observe = uris.split(',') }
|
@@ -60,11 +57,11 @@ module Scrappy
|
|
60
57
|
onload
|
61
58
|
if Options.uri
|
62
59
|
Options.quiet = true
|
63
|
-
puts Agent.
|
60
|
+
puts Agent.new.proxy(:http_method=>:get, :uri=>Options.uri).output
|
64
61
|
elsif Options.observe
|
65
|
-
Agent.
|
62
|
+
Agent.new.observe(Options.observe)
|
66
63
|
elsif Options.admin
|
67
|
-
puts "Launching Scrappy
|
64
|
+
puts "Launching Scrappy Web Admin (browse http://localhost:#{Options.port})..."
|
68
65
|
require 'scrappy/server/server'
|
69
66
|
Thin::Logging.silent = true
|
70
67
|
Scrappy::Server.register Scrappy::Admin
|
@@ -76,14 +73,10 @@ module Scrappy
|
|
76
73
|
Thin::Logging.silent = true
|
77
74
|
Scrappy::Server.run! :host => 'localhost', :port => Options.port, :environment => :production,
|
78
75
|
:base_uri => Options.base_uri
|
79
|
-
elsif Options.shell
|
80
|
-
puts "Launching Scrappy Shell..."
|
81
|
-
require 'scrappy/shell'
|
82
|
-
Shell.new.run
|
83
76
|
else
|
84
|
-
|
85
|
-
|
86
|
-
|
77
|
+
output_version
|
78
|
+
puts 'To get help use: scrappy -h'
|
79
|
+
exit 0
|
87
80
|
end
|
88
81
|
Scrappy::App.quit
|
89
82
|
end
|
@@ -106,7 +99,7 @@ Usage
|
|
106
99
|
Options
|
107
100
|
-h, --help Displays help message
|
108
101
|
-v, --version Display the version, then exit
|
109
|
-
-f, --format Picks output format (json, ejson,
|
102
|
+
-f, --format Picks output format (json, ejson, rdf, ntriples, png)
|
110
103
|
-g, --get URL Gets requested URL
|
111
104
|
-p, --post URL Posts requested URL
|
112
105
|
-c, --concurrence VALUE Sets number of concurrent connections for crawling (default is 10)
|
@@ -114,16 +107,13 @@ Options
|
|
114
107
|
-d, --delay VALUE Sets delay (in ms) between requests (default is 0)
|
115
108
|
-D, --dump Dumps RDF data to disk
|
116
109
|
-u, --debug Shows debugging traces
|
117
|
-
-i, --interactive Runs interactive shell
|
118
110
|
-o, --observe URLs Observes the specified URLs storing their data into the repository
|
119
111
|
-s, --server [ROOT] Runs web server (optionally specify server's root url)
|
120
|
-
-
|
112
|
+
-a, --admin [ROOT] Runs admin web server (optionally specify server's root url)
|
121
113
|
-P, --port PORT Selects port number (default is 3434)
|
122
|
-
-
|
123
|
-
-t, --time DAYS Returns repository data from the last given minutes
|
114
|
+
-t, --time TIME Returns repository data from the last given minutes
|
124
115
|
-r, --reference Outputs referenceable data
|
125
116
|
-R, --reference-all Outputs all HTML referenceable data
|
126
|
-
-w, --window Shows browser window (requires -v)
|
127
117
|
|
128
118
|
Authors
|
129
119
|
José Ignacio Fernández, Alberto Mardomingo, Jacobo Blasco
|
data/lib/scrappy.rb
CHANGED
@@ -17,12 +17,13 @@ require 'scrappy/agent/map_reduce'
|
|
17
17
|
require 'scrappy/agent/cache'
|
18
18
|
require 'scrappy/agent/dumper'
|
19
19
|
require 'scrappy/agent/formats'
|
20
|
+
require 'scrappy/agent/blind_agent'
|
20
21
|
require 'scrappy/agent/agent'
|
21
22
|
|
22
23
|
Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
|
23
24
|
|
24
25
|
module Scrappy
|
25
|
-
VERSION = '0.
|
26
|
+
VERSION = '0.3.0'
|
26
27
|
end
|
27
28
|
|
28
29
|
# Require selectors
|
data/lib/scrappy/agent/agent.rb
CHANGED
@@ -4,8 +4,9 @@ module Scrappy
|
|
4
4
|
include Extractor
|
5
5
|
include MapReduce
|
6
6
|
include Cached
|
7
|
+
include BlindAgent
|
7
8
|
|
8
|
-
Options = OpenStruct.new :format=>:yarf, :format_header=>true, :depth=>0, :
|
9
|
+
Options = OpenStruct.new :format=>:yarf, :format_header=>true, :depth=>0, :delay=>0, :workers=>10
|
9
10
|
ContentTypes = { :png => 'image/png', :rdfxml => 'application/rdf+xml',
|
10
11
|
:rdf => 'application/rdf+xml' }
|
11
12
|
|
@@ -13,17 +14,7 @@ module Scrappy
|
|
13
14
|
@pool ||= {}
|
14
15
|
end
|
15
16
|
def self.[] id
|
16
|
-
pool[id] || Agent.
|
17
|
-
end
|
18
|
-
|
19
|
-
def self.create args={}
|
20
|
-
if (args[:agent] || Options.agent) == :visual
|
21
|
-
require 'scrappy/agent/visual_agent'
|
22
|
-
VisualAgent.new args
|
23
|
-
else
|
24
|
-
require 'scrappy/agent/blind_agent'
|
25
|
-
BlindAgent.new args
|
26
|
-
end
|
17
|
+
pool[id] || Agent.new(:id=>id)
|
27
18
|
end
|
28
19
|
|
29
20
|
attr_accessor :id, :options, :kb
|
@@ -160,7 +151,8 @@ module Scrappy
|
|
160
151
|
end
|
161
152
|
|
162
153
|
def clean triples
|
163
|
-
triples.uniq.select { |s,p,o| p!=
|
154
|
+
triples.uniq.select { |s,p,o| p!=ID('rdf:type') or ![ID('sc:Index'), ID('sc:Page')].include?(o) }.
|
155
|
+
select { |s,p,o| p!=Node('rdf:type') or ![Node('sc:Index'), Node('sc:Page')].include?(o) }
|
164
156
|
end
|
165
157
|
|
166
158
|
# Do the extraction using RDF repository
|
@@ -231,7 +223,6 @@ module Scrappy
|
|
231
223
|
puts 'done!' if options.debug
|
232
224
|
|
233
225
|
if self.html_data?
|
234
|
-
add_visual_data! if options.referenceable # Adds tags including visual information
|
235
226
|
triples = extract(self.uri, html, options.referenceable) # Extract data
|
236
227
|
Dumper.dump self.uri, clean(triples), options.format if options.dump # Dump results to disk
|
237
228
|
triples
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module Scrappy
|
2
|
-
|
2
|
+
module BlindAgent
|
3
3
|
def initialize args={}
|
4
|
-
super
|
4
|
+
super()
|
5
5
|
@mechanize = Mechanize.new
|
6
6
|
@mechanize.max_history = 20
|
7
7
|
end
|
@@ -36,8 +36,5 @@ module Scrappy
|
|
36
36
|
def html
|
37
37
|
@mechanize.current_page.root.to_html :encoding=>'UTF-8'
|
38
38
|
end
|
39
|
-
|
40
|
-
def add_visual_data!
|
41
|
-
end
|
42
39
|
end
|
43
40
|
end
|
@@ -96,7 +96,7 @@ module Scrappy
|
|
96
96
|
end
|
97
97
|
|
98
98
|
def filter selector, doc
|
99
|
-
if
|
99
|
+
if selector.sc::debug.first=="true" and options.debug
|
100
100
|
puts '== DEBUG'
|
101
101
|
puts '== Selector:'
|
102
102
|
puts selector.serialize(:yarf, false)
|
@@ -109,7 +109,7 @@ module Scrappy
|
|
109
109
|
# Process selector
|
110
110
|
results = selector_pool(selector).filter doc
|
111
111
|
|
112
|
-
if
|
112
|
+
if selector.sc::debug.first=="true" and options.debug
|
113
113
|
puts "== No results" if results.empty?
|
114
114
|
results.each_with_index do |result, i|
|
115
115
|
puts "== Result ##{i}:"
|
@@ -25,9 +25,17 @@ module Scrappy
|
|
25
25
|
doc.search("p").each {|n| n.replace(Nokogiri::XML::Text.new("#{n.text.strip}\n", n.document)) }
|
26
26
|
doc.text.strip
|
27
27
|
when Node('sc:Html') then
|
28
|
-
node.to_html
|
28
|
+
if node.respond_to? :to_html
|
29
|
+
node.to_html
|
30
|
+
else
|
31
|
+
node.to_s
|
32
|
+
end
|
29
33
|
else
|
30
|
-
node.text
|
34
|
+
if node.respond_to? :text
|
35
|
+
node.text
|
36
|
+
else
|
37
|
+
node.to_s
|
38
|
+
end
|
31
39
|
end
|
32
40
|
end
|
33
41
|
|
@@ -8,7 +8,7 @@ module Sc
|
|
8
8
|
# Select node's attribute if given
|
9
9
|
sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content][attribute] } }
|
10
10
|
else
|
11
|
-
[ { :uri=>doc[:uri], :content=>doc[:content], :value=>format(doc[:
|
11
|
+
[ { :uri=>doc[:uri], :content=>doc[:content], :value=>format(doc[:value], sc::format, doc[:uri]) } ]
|
12
12
|
end
|
13
13
|
end
|
14
14
|
end
|
@@ -3,10 +3,12 @@ require 'thin'
|
|
3
3
|
require 'haml'
|
4
4
|
require 'scrappy/server/helpers'
|
5
5
|
require 'scrappy/server/admin'
|
6
|
+
require 'scrappy/server/errors'
|
6
7
|
|
7
8
|
module Scrappy
|
8
9
|
class Server < Sinatra::Base
|
9
10
|
helpers JavaScriptHelpers
|
11
|
+
register Errors
|
10
12
|
|
11
13
|
enable :sessions
|
12
14
|
set :root, File.join(File.dirname(__FILE__), '..', '..', '..')
|
@@ -40,7 +42,7 @@ module Scrappy
|
|
40
42
|
return @agent if @agent
|
41
43
|
if session[:agent].nil? || session[:token] != SESSION_TOKEN
|
42
44
|
session[:token] = SESSION_TOKEN
|
43
|
-
session[:agent] = Scrappy::Agent.
|
45
|
+
session[:agent] = Scrappy::Agent.new.id
|
44
46
|
end
|
45
47
|
@agent = Scrappy::Agent[session[:agent]]
|
46
48
|
end
|
data/scrappy.gemspec
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.
|
5
|
+
s.version = "0.3.0"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
@@ -11,22 +11,22 @@ Gem::Specification.new do |s|
|
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
13
13
|
s.executables = ["scrappy"]
|
14
|
-
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/
|
15
|
-
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/
|
14
|
+
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/repository.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb"]
|
15
|
+
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/repository.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server/admin.rb", "lib/scrappy/server/errors.rb", "lib/scrappy/server/helpers.rb", "lib/scrappy/server/server.rb", "lib/scrappy/support.rb", "public/favicon.ico", "public/images/logo.png", "public/images/logo_tiny.png", "public/javascripts/scrappy.js", "public/stylesheets/application.css", "test/test_helper.rb", "test/test_scrappy.rb", "views/help.haml", "views/home.haml", "views/kb.haml", "views/layout.haml", "scrappy.gemspec"]
|
16
16
|
s.homepage = %q{http://github.com/josei/scrappy}
|
17
17
|
s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
|
18
18
|
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
|
19
19
|
s.require_paths = ["lib"]
|
20
20
|
s.rubyforge_project = %q{scrappy}
|
21
|
-
s.rubygems_version = %q{1.3.
|
21
|
+
s.rubygems_version = %q{1.3.7}
|
22
22
|
s.summary = %q{Web scraper that allows producing RDF data out of plain web pages}
|
23
|
-
s.test_files = ["test/
|
23
|
+
s.test_files = ["test/test_helper.rb", "test/test_scrappy.rb"]
|
24
24
|
|
25
25
|
if s.respond_to? :specification_version then
|
26
26
|
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
27
27
|
s.specification_version = 3
|
28
28
|
|
29
|
-
if Gem::Version.new(Gem::
|
29
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
30
30
|
s.add_runtime_dependency(%q<activesupport>, [">= 2.3.5"])
|
31
31
|
s.add_runtime_dependency(%q<sinatra>, [">= 1.1.2"])
|
32
32
|
s.add_runtime_dependency(%q<thin>, [">= 1.2.7"])
|
data/views/kb.haml
CHANGED
@@ -10,6 +10,6 @@
|
|
10
10
|
-else
|
11
11
|
=uri
|
12
12
|
-if !uri.include?('*')
|
13
|
-
-[['RDF', :rdf], ['JSON', :ejson], ['YARF', :yarf], ['PNG', :png]].reverse.each do |format, format_code|
|
13
|
+
-[['RDF', :rdf], ['JSON', :ejson], ['YARF', :yarf], ['nTriples', :ntriples], ['PNG', :png]].reverse.each do |format, format_code|
|
14
14
|
%span.format
|
15
15
|
%a{:href=>"#{settings.base_uri}/#{format_code}/#{uri}"}=format
|
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrappy
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
+
hash: 19
|
4
5
|
prerelease: false
|
5
6
|
segments:
|
6
7
|
- 0
|
7
|
-
-
|
8
|
-
-
|
9
|
-
version: 0.
|
8
|
+
- 3
|
9
|
+
- 0
|
10
|
+
version: 0.3.0
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- Jose Ignacio
|
@@ -21,9 +22,11 @@ dependencies:
|
|
21
22
|
name: activesupport
|
22
23
|
prerelease: false
|
23
24
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
24
26
|
requirements:
|
25
27
|
- - ">="
|
26
28
|
- !ruby/object:Gem::Version
|
29
|
+
hash: 9
|
27
30
|
segments:
|
28
31
|
- 2
|
29
32
|
- 3
|
@@ -35,9 +38,11 @@ dependencies:
|
|
35
38
|
name: sinatra
|
36
39
|
prerelease: false
|
37
40
|
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
38
42
|
requirements:
|
39
43
|
- - ">="
|
40
44
|
- !ruby/object:Gem::Version
|
45
|
+
hash: 23
|
41
46
|
segments:
|
42
47
|
- 1
|
43
48
|
- 1
|
@@ -49,9 +54,11 @@ dependencies:
|
|
49
54
|
name: thin
|
50
55
|
prerelease: false
|
51
56
|
requirement: &id003 !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
52
58
|
requirements:
|
53
59
|
- - ">="
|
54
60
|
- !ruby/object:Gem::Version
|
61
|
+
hash: 17
|
55
62
|
segments:
|
56
63
|
- 1
|
57
64
|
- 2
|
@@ -63,9 +70,11 @@ dependencies:
|
|
63
70
|
name: nokogiri
|
64
71
|
prerelease: false
|
65
72
|
requirement: &id004 !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
66
74
|
requirements:
|
67
75
|
- - ">="
|
68
76
|
- !ruby/object:Gem::Version
|
77
|
+
hash: 5
|
69
78
|
segments:
|
70
79
|
- 1
|
71
80
|
- 4
|
@@ -77,9 +86,11 @@ dependencies:
|
|
77
86
|
name: mechanize
|
78
87
|
prerelease: false
|
79
88
|
requirement: &id005 !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
80
90
|
requirements:
|
81
91
|
- - ">="
|
82
92
|
- !ruby/object:Gem::Version
|
93
|
+
hash: 23
|
83
94
|
segments:
|
84
95
|
- 1
|
85
96
|
- 0
|
@@ -91,9 +102,11 @@ dependencies:
|
|
91
102
|
name: lightrdf
|
92
103
|
prerelease: false
|
93
104
|
requirement: &id006 !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
94
106
|
requirements:
|
95
107
|
- - ">="
|
96
108
|
- !ruby/object:Gem::Version
|
109
|
+
hash: 21
|
97
110
|
segments:
|
98
111
|
- 0
|
99
112
|
- 2
|
@@ -105,9 +118,11 @@ dependencies:
|
|
105
118
|
name: i18n
|
106
119
|
prerelease: false
|
107
120
|
requirement: &id007 !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
108
122
|
requirements:
|
109
123
|
- - ">="
|
110
124
|
- !ruby/object:Gem::Version
|
125
|
+
hash: 11
|
111
126
|
segments:
|
112
127
|
- 0
|
113
128
|
- 4
|
@@ -119,9 +134,11 @@ dependencies:
|
|
119
134
|
name: rest-client
|
120
135
|
prerelease: false
|
121
136
|
requirement: &id008 !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
122
138
|
requirements:
|
123
139
|
- - ">="
|
124
140
|
- !ruby/object:Gem::Version
|
141
|
+
hash: 13
|
125
142
|
segments:
|
126
143
|
- 1
|
127
144
|
- 6
|
@@ -133,9 +150,11 @@ dependencies:
|
|
133
150
|
name: haml
|
134
151
|
prerelease: false
|
135
152
|
requirement: &id009 !ruby/object:Gem::Requirement
|
153
|
+
none: false
|
136
154
|
requirements:
|
137
155
|
- - ">="
|
138
156
|
- !ruby/object:Gem::Version
|
157
|
+
hash: 55
|
139
158
|
segments:
|
140
159
|
- 3
|
141
160
|
- 0
|
@@ -160,7 +179,6 @@ extra_rdoc_files:
|
|
160
179
|
- lib/scrappy/agent/extractor.rb
|
161
180
|
- lib/scrappy/agent/formats.rb
|
162
181
|
- lib/scrappy/agent/map_reduce.rb
|
163
|
-
- lib/scrappy/agent/visual_agent.rb
|
164
182
|
- lib/scrappy/repository.rb
|
165
183
|
- lib/scrappy/selectors/base_uri.rb
|
166
184
|
- lib/scrappy/selectors/css.rb
|
@@ -172,11 +190,10 @@ extra_rdoc_files:
|
|
172
190
|
- lib/scrappy/selectors/uri_pattern.rb
|
173
191
|
- lib/scrappy/selectors/xpath.rb
|
174
192
|
- lib/scrappy/server/admin.rb
|
193
|
+
- lib/scrappy/server/errors.rb
|
175
194
|
- lib/scrappy/server/helpers.rb
|
176
195
|
- lib/scrappy/server/server.rb
|
177
|
-
- lib/scrappy/shell.rb
|
178
196
|
- lib/scrappy/support.rb
|
179
|
-
- lib/scrappy/webkit/webkit.rb
|
180
197
|
files:
|
181
198
|
- History.txt
|
182
199
|
- Manifest
|
@@ -192,7 +209,6 @@ files:
|
|
192
209
|
- lib/scrappy/agent/extractor.rb
|
193
210
|
- lib/scrappy/agent/formats.rb
|
194
211
|
- lib/scrappy/agent/map_reduce.rb
|
195
|
-
- lib/scrappy/agent/visual_agent.rb
|
196
212
|
- lib/scrappy/repository.rb
|
197
213
|
- lib/scrappy/selectors/base_uri.rb
|
198
214
|
- lib/scrappy/selectors/css.rb
|
@@ -204,11 +220,10 @@ files:
|
|
204
220
|
- lib/scrappy/selectors/uri_pattern.rb
|
205
221
|
- lib/scrappy/selectors/xpath.rb
|
206
222
|
- lib/scrappy/server/admin.rb
|
223
|
+
- lib/scrappy/server/errors.rb
|
207
224
|
- lib/scrappy/server/helpers.rb
|
208
225
|
- lib/scrappy/server/server.rb
|
209
|
-
- lib/scrappy/shell.rb
|
210
226
|
- lib/scrappy/support.rb
|
211
|
-
- lib/scrappy/webkit/webkit.rb
|
212
227
|
- public/favicon.ico
|
213
228
|
- public/images/logo.png
|
214
229
|
- public/images/logo_tiny.png
|
@@ -236,16 +251,20 @@ rdoc_options:
|
|
236
251
|
require_paths:
|
237
252
|
- lib
|
238
253
|
required_ruby_version: !ruby/object:Gem::Requirement
|
254
|
+
none: false
|
239
255
|
requirements:
|
240
256
|
- - ">="
|
241
257
|
- !ruby/object:Gem::Version
|
258
|
+
hash: 3
|
242
259
|
segments:
|
243
260
|
- 0
|
244
261
|
version: "0"
|
245
262
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
263
|
+
none: false
|
246
264
|
requirements:
|
247
265
|
- - ">="
|
248
266
|
- !ruby/object:Gem::Version
|
267
|
+
hash: 11
|
249
268
|
segments:
|
250
269
|
- 1
|
251
270
|
- 2
|
@@ -253,10 +272,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
253
272
|
requirements: []
|
254
273
|
|
255
274
|
rubyforge_project: scrappy
|
256
|
-
rubygems_version: 1.3.
|
275
|
+
rubygems_version: 1.3.7
|
257
276
|
signing_key:
|
258
277
|
specification_version: 3
|
259
278
|
summary: Web scraper that allows producing RDF data out of plain web pages
|
260
279
|
test_files:
|
261
|
-
- test/test_scrappy.rb
|
262
280
|
- test/test_helper.rb
|
281
|
+
- test/test_scrappy.rb
|
@@ -1,101 +0,0 @@
|
|
1
|
-
# Hack to hide annoying gtk debug messages
|
2
|
-
old_stderr = $stderr.clone
|
3
|
-
$stderr.reopen '/dev/null'
|
4
|
-
require 'scrappy/webkit/webkit'
|
5
|
-
$stderr = old_stderr
|
6
|
-
|
7
|
-
module Scrappy
|
8
|
-
class VisualAgent < Agent
|
9
|
-
attr_reader :visible
|
10
|
-
|
11
|
-
def initialize args={}
|
12
|
-
super
|
13
|
-
|
14
|
-
@cv = new_cond
|
15
|
-
|
16
|
-
@webview = Gtk::WebKit::WebView.new
|
17
|
-
@webview.signal_connect("load_finished") { synchronize { @cv.signal } }
|
18
|
-
|
19
|
-
@window = Gtk::Window.new
|
20
|
-
@window.signal_connect("destroy") { Gtk.main_quit }
|
21
|
-
@window.add(@webview)
|
22
|
-
@window.set_size_request(1024, 600)
|
23
|
-
if args[:window] or (args[:window].nil? and Agent::Options.window)
|
24
|
-
@window.show_all
|
25
|
-
@visible = true
|
26
|
-
end
|
27
|
-
@mechanize = Mechanize.new
|
28
|
-
end
|
29
|
-
|
30
|
-
def uri
|
31
|
-
@uri
|
32
|
-
end
|
33
|
-
|
34
|
-
def uri= uri
|
35
|
-
# First, check if the requested uri is a valid HTML page
|
36
|
-
valid = begin
|
37
|
-
@mechanize.get(uri).is_a?(Mechanize::Page)
|
38
|
-
rescue
|
39
|
-
false
|
40
|
-
end
|
41
|
-
|
42
|
-
# Open the page in the browser if it's an HTML page
|
43
|
-
if valid
|
44
|
-
synchronize do
|
45
|
-
@webview.open uri.to_s
|
46
|
-
@cv.wait(60) # 1 minute to open the page
|
47
|
-
@uri = @webview.uri
|
48
|
-
end
|
49
|
-
else
|
50
|
-
@uri = nil
|
51
|
-
end
|
52
|
-
end
|
53
|
-
|
54
|
-
def html_data?
|
55
|
-
uri.to_s != ""
|
56
|
-
end
|
57
|
-
|
58
|
-
def html
|
59
|
-
js "document.documentElement.outerHTML"
|
60
|
-
end
|
61
|
-
|
62
|
-
def add_visual_data!
|
63
|
-
js """var items = document.documentElement.getElementsByTagName('*');
|
64
|
-
var i=0;
|
65
|
-
for(var i=0; i<items.length; i++) {
|
66
|
-
var item = items[i];
|
67
|
-
item.setAttribute('vx', item.offsetLeft);
|
68
|
-
item.setAttribute('vy', item.offsetTop);
|
69
|
-
item.setAttribute('vw', item.offsetWidth);
|
70
|
-
item.setAttribute('vh', item.offsetHeight);
|
71
|
-
item.setAttribute('vsize', document.defaultView.getComputedStyle(item, null).getPropertyValue('font-size'));
|
72
|
-
var weight = document.defaultView.getComputedStyle(item, null).getPropertyValue('font-weight');
|
73
|
-
if (weight == 'normal') weight = 400;
|
74
|
-
if (weight == 'bold') weight = 700;
|
75
|
-
item.setAttribute('vweight', weight);
|
76
|
-
item.setAttribute('vcolor', document.defaultView.getComputedStyle(item, null).getPropertyValue('color'));
|
77
|
-
item.setAttribute('vbcolor', document.defaultView.getComputedStyle(item, null).getPropertyValue('background-color'));
|
78
|
-
}"""
|
79
|
-
end
|
80
|
-
|
81
|
-
def js code
|
82
|
-
old_title = @webview.title
|
83
|
-
@webview.execute_script("document.title = JSON.stringify(eval(#{ActiveSupport::JSON.encode(code)}))")
|
84
|
-
title = ActiveSupport::JSON.decode(@webview.title)
|
85
|
-
@webview.execute_script("document.title = #{ActiveSupport::JSON.encode(old_title)}")
|
86
|
-
title
|
87
|
-
end
|
88
|
-
|
89
|
-
def load_js url
|
90
|
-
function = """function include(destination) {
|
91
|
-
var e=window.document.createElement('script');
|
92
|
-
e.setAttribute('src',destination);
|
93
|
-
window.document.body.appendChild(e);
|
94
|
-
}"""
|
95
|
-
js function
|
96
|
-
js "include('#{url}')"
|
97
|
-
end
|
98
|
-
end
|
99
|
-
end
|
100
|
-
|
101
|
-
Thread.new { Gtk.main }
|
data/lib/scrappy/shell.rb
DELETED
@@ -1,87 +0,0 @@
|
|
1
|
-
require 'readline'
|
2
|
-
|
3
|
-
module Scrappy
|
4
|
-
class Shell
|
5
|
-
def initialize file=nil
|
6
|
-
@agent = Agent.create
|
7
|
-
@file = file
|
8
|
-
end
|
9
|
-
|
10
|
-
def run
|
11
|
-
commands = ['get', 'quit', 'help', 'annotate', 'html']
|
12
|
-
|
13
|
-
Readline.completion_append_character = " "
|
14
|
-
Readline.completer_word_break_characters = ""
|
15
|
-
Readline.completion_proc = proc { |line| commands.grep(/^#{Regexp.escape(line)}/).sort }
|
16
|
-
|
17
|
-
if @file
|
18
|
-
open(@file, 'r').lines.each do |line|
|
19
|
-
break if process(line) == :quit
|
20
|
-
end
|
21
|
-
else
|
22
|
-
begin
|
23
|
-
line = Readline.readline(bash, true)
|
24
|
-
code = process line.nil? ? (puts 'quit' unless Options.quiet; 'quit') : line
|
25
|
-
end while code != :quit
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
|
-
protected
|
30
|
-
def process raw_command
|
31
|
-
command = raw_command.strip
|
32
|
-
|
33
|
-
code = if command =~ /\Aget\W(.*)\Z/
|
34
|
-
puts @agent.proxy(:uri=>$1).output
|
35
|
-
puts
|
36
|
-
elsif command == 'help'
|
37
|
-
puts 'Available commands:'
|
38
|
-
puts ' get URL: Visit the specified URL'
|
39
|
-
puts ' html: Show HTML code of the current URL'
|
40
|
-
puts ' annotate: Start the annotation tool that helps building extractors'
|
41
|
-
puts ' help: Show this information'
|
42
|
-
puts ' quit: Exit scrappy shell'
|
43
|
-
puts
|
44
|
-
elsif command == 'annotate'
|
45
|
-
if @agent.class.to_s == 'Scrappy::VisualAgent' and @agent.visible
|
46
|
-
@agent.load_js "http://ajax.googleapis.com/ajax/libs/jquery/1/jquery.min.js"
|
47
|
-
@agent.load_js "http://github.com/josei/scrappy/raw/master/lib/js/annotator.js"
|
48
|
-
puts "Use the browser's window to annotate resources"
|
49
|
-
puts
|
50
|
-
else
|
51
|
-
puts 'ERROR: Scrappy must be run with -v and -w options to use this feature'
|
52
|
-
puts
|
53
|
-
end
|
54
|
-
elsif command == 'html'
|
55
|
-
puts @agent.html
|
56
|
-
puts
|
57
|
-
elsif command == 'quit'
|
58
|
-
:quit
|
59
|
-
elsif command == '' or command[0..0] == '#'
|
60
|
-
nil
|
61
|
-
else
|
62
|
-
puts "ERROR: Unknown command '#{command}'"
|
63
|
-
puts
|
64
|
-
end
|
65
|
-
code
|
66
|
-
end
|
67
|
-
|
68
|
-
def bash
|
69
|
-
return '' if Options.quiet
|
70
|
-
location = if @agent.uri
|
71
|
-
uri = URI::parse(@agent.uri)
|
72
|
-
path = uri.path.to_s
|
73
|
-
path = path[0..0] + "..." + path[-16..-1] if path.size > 20
|
74
|
-
if uri.query
|
75
|
-
query = "?" + uri.query
|
76
|
-
query = "?..." + query[-10..-1] if query.size > 13
|
77
|
-
else
|
78
|
-
query = ""
|
79
|
-
end
|
80
|
-
"#{uri.base}#{path}#{query}"
|
81
|
-
else
|
82
|
-
''
|
83
|
-
end
|
84
|
-
"#{location}$ "
|
85
|
-
end
|
86
|
-
end
|
87
|
-
end
|
@@ -1,18 +0,0 @@
|
|
1
|
-
require 'gtk2'
|
2
|
-
module Gtk
|
3
|
-
module WebKit
|
4
|
-
end
|
5
|
-
end
|
6
|
-
|
7
|
-
require 'rbwebkitgtk.so'
|
8
|
-
|
9
|
-
class Gtk::WebKit::WebView
|
10
|
-
alias :load_html_string_no_defaults :load_html_string
|
11
|
-
def load_html_string(content, base_uri=nil)
|
12
|
-
load_html_string_no_defaults(content, base_uri)
|
13
|
-
end
|
14
|
-
|
15
|
-
def mark_text_matches(test, case_sensitive=false, limit=0)
|
16
|
-
mark_text_matches_with_limit(test, case_sensitive, limit)
|
17
|
-
end
|
18
|
-
end
|