scrappy 0.1 → 0.1.1
- data/History.txt +6 -0
- data/{Manifest.txt → Manifest} +2 -1
- data/README.rdoc +9 -9
- data/Rakefile +16 -14
- data/bin/scrappy +2 -1
- data/lib/js/annotator.js +44 -0
- data/lib/scrappy.rb +1 -1
- data/lib/scrappy/agent/extractor.rb +14 -3
- data/lib/scrappy/agent/visual_agent.rb +34 -9
- data/lib/scrappy/shell.rb +19 -4
- data/scrappy.gemspec +55 -0
- metadata +40 -76
data/History.txt
CHANGED
data/{Manifest.txt → Manifest}
RENAMED
@@ -1,9 +1,9 @@
 History.txt
-Manifest.txt
 README.rdoc
 Rakefile
 bin/scrappy
 kb/elmundo.yarf
+lib/js/annotator.js
 lib/scrappy.rb
 lib/scrappy/agent/agent.rb
 lib/scrappy/agent/blind_agent.rb
@@ -17,3 +17,4 @@ lib/scrappy/support.rb
 lib/scrappy/webkit/webkit.rb
 test/test_helper.rb
 test/test_scrappy.rb
+Manifest
data/README.rdoc
CHANGED
@@ -1,10 +1,10 @@
-= Scrappy
+= scrappy
 
 * http://github.com/josei/scrappy
 
 == DESCRIPTION:
 
-Scrappy is a tool that allows extracting information from web pages and producing RDF data.
+scrappy is a tool that allows extracting information from web pages and producing RDF data.
 It uses the scraping ontology to define the mappings between HTML contents and RDF data.
 
 An example of mapping is shown next, which allows extracting all titles from http://www.elmundo.es:
@@ -54,7 +54,7 @@ tool by typing:
 
   $ scrappy --help
 
-Scrappy offers many different interfaces to get RDF data from a web page:
+scrappy offers many different interfaces to get RDF data from a web page:
 
 * Command-line interface:
 
@@ -63,7 +63,7 @@ Scrappy offers many different interfaces to get RDF data from a web page:
 * Interactive shell:
 
   $ scrappy -i
-  Launching Scrappy Shell...
+  Launching scrappy Shell...
   $ get elmundo.es
   dc: http://purl.org/dc/elements/1.1/
   owl: http://www.w3.org/2002/07/owl#
@@ -89,7 +89,7 @@ Scrappy offers many different interfaces to get RDF data from a web page:
 * Web Service interface:
 
   $ scrappy -s
-  Launching Scrappy Web Server...
+  Launching scrappy Web Server...
   ** Starting Mongrel on localhost:3434
 
   Then point your browser to http://localhost:3434 for additional directions.
@@ -97,7 +97,7 @@ Scrappy offers many different interfaces to get RDF data from a web page:
 * Web Proxy interface:
 
   $ scrappy -S
-  Launching Scrappy Web Proxy...
+  Launching scrappy Web Proxy...
   ** Starting Mongrel on localhost:3434
 
   Then configure your browser's HTTP proxy to http://localhost:3434 and browse http://www.elmundo.es
@@ -117,7 +117,7 @@ Scrappy offers many different interfaces to get RDF data from a web page:
 
 * Ruby interface:
 
-  You can use Scrappy in a Ruby program by requiring the gem:
+  You can use scrappy in a Ruby program by requiring the gem:
 
   require 'rubygems'
   require 'scrappy'
@@ -126,13 +126,13 @@ Scrappy offers many different interfaces to get RDF data from a web page:
   kb = RDF::Parser.parse(:rdf, open("kb.rdf").read)
 
   # Create an agent
-  agent = Scrappy::Agent.create :kb=>kb
+  agent = scrappy::Agent.create :kb=>kb
 
   # Get RDF output
   output = agent.request :get, 'http://www.example.com'
 
   # Output all titles from the web page
-  titles = output.find(
+  titles = output.find([], Node('dc:title'), nil)
   titles.each { |title| puts title }
 
 == INSTALL:
data/Rakefile
CHANGED
@@ -1,20 +1,22 @@
 require 'rubygems'
-
-require '
-require 'fileutils'
+require 'rake'
+require 'echoe'
 require './lib/scrappy'
 
-
+Echoe.new('scrappy', Scrappy::VERSION) do |p|
+  p.description = "RDF web scraper"
+  p.summary = "Web scraper that allows producing RDF data out of plain web pages"
+  p.url = "http://github.com/josei/scrappy"
+  p.author = "Jose Ignacio"
+  p.email = "joseignacio.fernandez@gmail.com"
+  p.install_message = '**(Optional) Remember to install rbwebkitgtk for visual parsing features**'
+  p.ignore_pattern = ["pkg/*"]
+  p.development_dependencies = [['activesupport','>= 2.3.5'], ['markaby', '>= 0.7.1'], ['camping', '= 2.0'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.1'], ['mongrel', '>= 1.1.5']]
+end
 
-
-
-
-  self.developer 'Jose Ignacio', 'joseignacio.fernandez@gmail.com'
-  self.summary = "Web scraper that allows producing RDF data out of plain web pages"
-  self.post_install_message = '**(Optional) Remember to install rbwebkitgtk for visual parsing features**'
-  self.rubyforge_name = self.name
-  self.extra_deps = [['activesupport','>= 2.3.5'], ['markaby', '>= 0.7.1'], ['camping', '= 2.0'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.1']]
+Rake::RDocTask.new(:rdoc) do |rdoc|
+  rdoc.rdoc_files.include('README.rdoc').include('lib/**/*.rb')
+  rdoc.main = "README.rdoc"
 end
 
-
-Dir['tasks/**/*.rake'].each { |t| load t }
+Dir["#{File.dirname(__FILE__)}/tasks/*.rake"].sort.each
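The packaging DSL moves from per-attribute self.* settings to an Echoe block, and Echoe also takes over manifest and gemspec generation (which is why Manifest.txt becomes Manifest and a checked-in scrappy.gemspec appears below). A sketch of the resulting release workflow, assuming stock Echoe task names:

  $ rake manifest        # regenerate the Manifest file
  $ rake build_gemspec   # regenerate scrappy.gemspec from the block above
  $ rake package         # build pkg/scrappy-0.1.1.gem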
data/bin/scrappy
CHANGED
@@ -1,4 +1,5 @@
 #!/usr/bin/ruby
+# encoding: UTF-8
 
 stty_save = `stty -g`.chomp
 trap('INT') { system('stty', stty_save); Scrappy::App.quit }
@@ -42,7 +43,7 @@ module Scrappy
     opts.on('-c C', '--concurrence C') { |c| Options.concurrence = c.to_i }
     opts.on('-d D', '--delay D') { |d| Agent::Options.delay = d; Options.concurrence = 1 }
     opts.on('-l L', '--levels L') { |l| Agent::Options.depth = l.to_i }
-    opts.on('-v', '--visual') { Agent::Options.agent = :visual }
+    opts.on('-v', '--visual') { Agent::Options.agent = :visual; Options.concurrence = 1 }
     opts.on('-r', '--reference') { Agent::Options.referenceable = :minimum }
     opts.on('-R', '--reference-all') { Agent::Options.referenceable = :dump }
     opts.on('-w', '--window') { Agent::Options.window = true }
data/lib/js/annotator.js
ADDED
@@ -0,0 +1,44 @@
+$(document).ready(function(){
+  $("body").append("<div id='myTrees'></div>")
+  $("#page > *").bind('mouseover', function(e){
+    e.stopPropagation();
+    $(this).addClass("changeBg");
+  })
+  .mouseout(function(){
+    $(this).removeClass("changeBg");
+  });
+});
+
+$(document).ready(function(){
+  $("*").bind('click', function(e){
+    e.stopPropagation();
+    var element = $(e.target).closest(this.tagName).get(0).tagName;
+    var parents = $(this).parents();
+    var string = element.toString();
+    for(j=0;j<parents.length;j++) {
+      string = string + " " + parents[j].tagName;
+    }
+
+    var tree = [];
+    var treeString = "";
+    for(h=parents.length-1; h>=0; h-- ) {
+      tree.push(parents[h].tagName);
+
+      if( treeString == "" ) {
+        treeString = treeString + parents[h].tagName;
+      } else {
+        treeString = treeString + " > " + parents[h].tagName;
+      }
+    }
+
+    tree.push(element);
+    treeString = treeString + " > " + element;
+
+    var myTrees = document.getElementById("myTrees");
+    var ul = document.createElement("ul");
+    var li = document.createElement("li");
+    myTrees.appendChild(ul);
+    li.innerHTML = treeString;
+    myTrees.appendChild(li);
+  });
+});
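Taken together, the two handlers form a rough visual annotation aid: hovering highlights the element under the cursor, and clicking appends the element's ancestor tag chain to the #myTrees list as a line such as HTML > BODY > DIV > H2 (illustrative output). Such a tag path is a natural starting point for the rdf:value of a sc:CssSelector in a scraping mapping.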
data/lib/scrappy.rb
CHANGED
data/lib/scrappy/agent/extractor.rb
CHANGED
@@ -1,3 +1,5 @@
+require 'digest/md5'
+
 module Scrappy
   module Extractor
     def extract uri, html, referenceable=nil
@@ -39,6 +41,7 @@ module Scrappy
         bnode = Node(nil)
         bnode.rdf::value = value
         bnode.rdf::type = Node('rdf:Literal')
+        options[:triples].push *bnode.triples
         bnode
       else
         value
@@ -53,7 +56,7 @@ module Scrappy
 
       # Add referenceable data if requested
      if options[:referenceable]
-        source = Node(
+        source = Node(node_hash(doc[:uri], doc[:content].path))
        options[:triples] << [ object, Node("sc:source"), source ]
        fragment.sc::type.each { |t| options[:triples] << [ source, Node("sc:type"), t ] }
        fragment.sc::relation.each { |relation| options[:triples] << [ source, Node("sc:relation"), relation ] }
@@ -115,12 +118,13 @@ module Scrappy
    def add_referenceable_data content, triples, referenceable
      resources = triples.map{|s,p,o| [[s],[o]]}.flatten
 
-      fragment = Node(
+      fragment = Node(node_hash(uri, '/'))
      selector = Node(nil)
      presentation = Node(nil)
 
      selector.rdf::type = Node('sc:UnivocalSelector')
      selector.sc::path = '/'
+      selector.sc::children = content.search('*').size.to_s
      selector.sc::uri = uri
 
      fragment.sc::selector = selector
@@ -128,7 +132,7 @@ module Scrappy
      triples.push(*fragment.graph.merge(presentation.graph).merge(selector.graph).triples) if referenceable==:dump or resources.include?(fragment)
 
      content.search('*').each do |node|
-        fragment = Node(
+        fragment = Node(node_hash(uri, node.path))
 
        if referenceable == :dump or resources.include?(fragment)
          selector = Node(nil)
@@ -147,6 +151,8 @@ module Scrappy
          presentation.sc::font_weight = node[:vweight].to_s if node[:vweight]
          presentation.sc::color = node[:vcolor].to_s if node[:vcolor]
          presentation.sc::background_color = node[:vbcolor].to_s if node[:vbcolor]
+          presentation.sc::text = node.text.strip
+          presentation.sc::children_count = node.search('*').size.to_s
 
          fragment.sc::selector = selector
          fragment.sc::presentation = presentation unless presentation.empty?
@@ -155,5 +161,10 @@ module Scrappy
        end
      end
    end
+
+    def node_hash uri, path
+      digest = Digest::MD5.hexdigest("#{uri} #{path}")
+      "_:bnode#{digest}"
+    end
  end
 end
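The new node_hash helper ties the referenceable-data changes together: it derives a stable blank-node label from the page URI plus the node's document path, so the same DOM node maps to the same RDF resource on every extraction. A minimal sketch of the behavior (the URI and path arguments here are made-up examples):

  require 'digest/md5'

  def node_hash uri, path
    digest = Digest::MD5.hexdigest("#{uri} #{path}")
    "_:bnode#{digest}"
  end

  node_hash('http://www.example.com', '/html/body/div[1]')
  # => always the same "_:bnode..." label for this uri/path pair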
data/lib/scrappy/agent/visual_agent.rb
CHANGED
@@ -6,6 +6,8 @@ $stderr = old_stderr
 
 module Scrappy
   class VisualAgent < Agent
+    attr_reader :visible
+
     def initialize args={}
       super
 
@@ -18,18 +20,33 @@ module Scrappy
      @window.signal_connect("destroy") { Gtk.main_quit }
      @window.add(@webview)
      @window.set_size_request(1024, 600)
-
+      if args[:window] or (args[:window].nil? and Agent::Options.window)
+        @window.show_all
+        @visible = true
+      end
    end
 
    def uri
-      @
+      @uri
    end
 
    def uri= uri
-
-
-
-
+      # First, check if the requested uri is a valid HTML page
+      valid = begin
+        Mechanize.new.get(uri).is_a?(Mechanize::Page)
+      rescue
+        false
+      end
+
+      # Open the page in the browser if it's an HTML page
+      if valid
+        synchronize do
+          @webview.open uri.to_s
+          @cv.wait(60) # 1 minute to open the page
+          @uri = @webview.uri
+        end
+      else
+        @uri = nil
      end
    end
 
@@ -40,7 +57,7 @@ module Scrappy
    def html
      js "document.documentElement.outerHTML"
    end
-
+
    def add_visual_data!
      js """var items = document.documentElement.getElementsByTagName('*');
            var i=0;
@@ -57,8 +74,6 @@ module Scrappy
         }"""
    end
 
-
-    private
    def js code
      old_title = @webview.title
      @webview.execute_script("document.title = JSON.stringify(eval(#{ActiveSupport::JSON.encode(code)}))")
@@ -66,6 +81,16 @@ module Scrappy
      @webview.execute_script("document.title = #{ActiveSupport::JSON.encode(old_title)}")
      title
    end
+
+    def load_js url
+      function = """function include(destination) {
+        var e=window.document.createElement('script');
+        e.setAttribute('src',destination);
+        window.document.body.appendChild(e);
+      }"""
+      js function
+      js "include('#{url}')"
+    end
  end
 end
 
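Note that the hunk above also drops the private keyword, so js and the new load_js become part of VisualAgent's public interface. load_js injects a script element into the currently loaded page; the new annotate shell command (next file) uses it exactly like this:

  # assuming `agent` is a Scrappy::VisualAgent with a visible window
  agent.load_js "http://ajax.googleapis.com/ajax/libs/jquery/1/jquery.min.js"
  agent.load_js "http://github.com/josei/scrappy/raw/master/lib/js/annotator.js"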
data/lib/scrappy/shell.rb
CHANGED
@@ -6,7 +6,7 @@ module Scrappy
   end
 
   def run
-    commands = ['get', '
+    commands = ['get', 'quit', 'help', 'annotate', 'html']
 
     Readline.completion_append_character = " "
     Readline.completer_word_break_characters = ""
@@ -30,20 +30,35 @@ module Scrappy
 
    code = if command =~ /\Aget\W(.*)\Z/
      puts @agent.proxy :get, $1
-      puts
+      puts
    elsif command == 'help'
      puts 'Available commands:'
      puts '  get URL: Visit the specified URL'
+      puts '  html: Show HTML code of the current URL'
+      puts '  annotate: Start the annotation tool that helps building extractors'
      puts '  help: Show this information'
      puts '  quit: Exit scrappy shell'
-      puts
+      puts
+    elsif command == 'annotate'
+      if @agent.class.to_s == 'Scrappy::VisualAgent' and @agent.visible
+        @agent.load_js "http://ajax.googleapis.com/ajax/libs/jquery/1/jquery.min.js"
+        @agent.load_js "http://github.com/josei/scrappy/raw/master/lib/js/annotator.js"
+        puts "Use the browser's window to annotate resources"
+        puts
+      else
+        puts 'ERROR: Scrappy must be run with -v and -w options to use this feature'
+        puts
+      end
+    elsif command == 'html'
+      puts @agent.html
+      puts
    elsif command == 'quit'
      :quit
    elsif command == '' or command[0..0] == '#'
      nil
    else
      puts "ERROR: Unknown command '#{command}'"
-      puts
+      puts
    end
    code
  end
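An illustrative session exercising the new commands (annotate requires scrappy to have been started with both -v and -w, per the error message above):

  $ scrappy -i -v -w
  Launching scrappy Shell...
  $ get elmundo.es
  $ html
  $ annotate
  Use the browser's window to annotate resources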
data/scrappy.gemspec
ADDED
@@ -0,0 +1,55 @@
+# -*- encoding: utf-8 -*-
+
+Gem::Specification.new do |s|
+  s.name = %q{scrappy}
+  s.version = "0.1.1"
+
+  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
+  s.authors = ["Jose Ignacio"]
+  s.date = %q{2010-10-29}
+  s.default_executable = %q{scrappy}
+  s.description = %q{RDF web scraper}
+  s.email = %q{joseignacio.fernandez@gmail.com}
+  s.executables = ["scrappy"]
+  s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
+  s.files = ["History.txt", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "Manifest", "scrappy.gemspec"]
+  s.homepage = %q{http://github.com/josei/scrappy}
+  s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
+  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
+  s.require_paths = ["lib"]
+  s.rubyforge_project = %q{scrappy}
+  s.rubygems_version = %q{1.3.6}
+  s.summary = %q{Web scraper that allows producing RDF data out of plain web pages}
+  s.test_files = ["test/test_scrappy.rb", "test/test_helper.rb"]
+
+  if s.respond_to? :specification_version then
+    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+    s.specification_version = 3
+
+    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+      s.add_development_dependency(%q<activesupport>, [">= 2.3.5"])
+      s.add_development_dependency(%q<markaby>, [">= 0.7.1"])
+      s.add_development_dependency(%q<camping>, ["= 2.0"])
+      s.add_development_dependency(%q<nokogiri>, [">= 1.4.1"])
+      s.add_development_dependency(%q<mechanize>, [">= 1.0.0"])
+      s.add_development_dependency(%q<lightrdf>, [">= 0.1"])
+      s.add_development_dependency(%q<mongrel>, [">= 1.1.5"])
+    else
+      s.add_dependency(%q<activesupport>, [">= 2.3.5"])
+      s.add_dependency(%q<markaby>, [">= 0.7.1"])
+      s.add_dependency(%q<camping>, ["= 2.0"])
+      s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
+      s.add_dependency(%q<mechanize>, [">= 1.0.0"])
+      s.add_dependency(%q<lightrdf>, [">= 0.1"])
+      s.add_dependency(%q<mongrel>, [">= 1.1.5"])
+    end
+  else
+    s.add_dependency(%q<activesupport>, [">= 2.3.5"])
+    s.add_dependency(%q<markaby>, [">= 0.7.1"])
+    s.add_dependency(%q<camping>, ["= 2.0"])
+    s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
+    s.add_dependency(%q<mechanize>, [">= 1.0.0"])
+    s.add_dependency(%q<lightrdf>, [">= 0.1"])
+    s.add_dependency(%q<mongrel>, [">= 1.1.5"])
+  end
+end
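Since the gemspec is now checked in, the gem can be built and installed without Echoe at all, using plain RubyGems commands:

  $ gem build scrappy.gemspec
  $ gem install scrappy-0.1.1.gem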
metadata
CHANGED
@@ -5,7 +5,8 @@ version: !ruby/object:Gem::Version
 segments:
 - 0
 - 1
-version: "0.1"
+- 1
+version: 0.1.1
 platform: ruby
 authors:
 - Jose Ignacio
@@ -13,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2010-10-
+date: 2010-10-29 00:00:00 +02:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -28,7 +29,7 @@ dependencies:
     - 3
     - 5
     version: 2.3.5
-  type: :
+  type: :development
   version_requirements: *id001
 - !ruby/object:Gem::Dependency
   name: markaby
@@ -42,7 +43,7 @@ dependencies:
     - 7
     - 1
     version: 0.7.1
-  type: :
+  type: :development
   version_requirements: *id002
 - !ruby/object:Gem::Dependency
   name: camping
@@ -55,7 +56,7 @@ dependencies:
     - 2
     - 0
     version: "2.0"
-  type: :
+  type: :development
   version_requirements: *id003
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -69,7 +70,7 @@ dependencies:
     - 4
     - 1
     version: 1.4.1
-  type: :
+  type: :development
   version_requirements: *id004
 - !ruby/object:Gem::Dependency
   name: mechanize
@@ -83,7 +84,7 @@ dependencies:
     - 0
     - 0
     version: 1.0.0
-  type: :
+  type: :development
   version_requirements: *id005
 - !ruby/object:Gem::Dependency
   name: lightrdf
@@ -96,94 +97,50 @@ dependencies:
     - 0
     - 1
     version: "0.1"
-  type: :
+  type: :development
   version_requirements: *id006
 - !ruby/object:Gem::Dependency
-  name:
+  name: mongrel
   prerelease: false
   requirement: &id007 !ruby/object:Gem::Requirement
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
         segments:
-
-
-
-        version:
+        - 1
+        - 1
+        - 5
+        version: 1.1.5
   type: :development
   version_requirements: *id007
-
-
-  prerelease: false
-  requirement: &id008 !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        segments:
-        - 2
-        - 6
-        - 0
-        version: 2.6.0
-  type: :development
-  version_requirements: *id008
-description: |-
-  Scrappy is a tool that allows extracting information from web pages and producing RDF data.
-  It uses the scraping ontology to define the mappings between HTML contents and RDF data.
-
-  An example of mapping is shown next, which allows extracting all titles from http://www.elmundo.es:
-
-    dc: http://purl.org/dc/elements/1.1/
-    rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
-    sioc: http://rdfs.org/sioc/ns#
-    sc: http://lab.gsi.dit.upm.es/scraping.rdf#
-    *:
-      rdf:type: sc:Fragment
-      sc:selector:
-        *:
-          rdf:type: sc:UriSelector
-          rdf:value: "http://www.elmundo.es/"
-      sc:identifier:
-        *:
-          rdf:type: sc:BaseUriSelector
-      sc:subfragment:
-        *:
-          sc:type: sioc:Post
-          sc:selector:
-            *:
-              rdf:type: sc:CssSelector
-              rdf:value: ".noticia h2, .noticia h3, .noticia h4"
-          sc:identifier:
-            *:
-              rdf:type: sc:CssSelector
-              rdf:value: "a"
-              sc:attribute: "href"
-          sc:subfragment:
-            *:
-              sc:type: rdf:Literal
-              sc:relation: dc:title
-              sc:selector:
-                *:
-                  rdf:type: sc:CssSelector
-                  rdf:value: "a"
-
-  (The above code is serialized using YARF format, supported by LightRDF gem, as well as
-  RDFXML, JSON, NTriples formats, which can also be used to define the mappings).
-email:
-- joseignacio.fernandez@gmail.com
+description: RDF web scraper
+email: joseignacio.fernandez@gmail.com
 executables:
 - scrappy
 extensions: []
 
 extra_rdoc_files:
--
--
+- README.rdoc
+- bin/scrappy
+- lib/js/annotator.js
+- lib/scrappy.rb
+- lib/scrappy/agent/agent.rb
+- lib/scrappy/agent/blind_agent.rb
+- lib/scrappy/agent/cluster.rb
+- lib/scrappy/agent/extractor.rb
+- lib/scrappy/agent/visual_agent.rb
+- lib/scrappy/proxy.rb
+- lib/scrappy/server.rb
+- lib/scrappy/shell.rb
+- lib/scrappy/support.rb
+- lib/scrappy/webkit/webkit.rb
 files:
 - History.txt
-- Manifest.txt
 - README.rdoc
 - Rakefile
 - bin/scrappy
 - kb/elmundo.yarf
+- lib/js/annotator.js
 - lib/scrappy.rb
 - lib/scrappy/agent/agent.rb
 - lib/scrappy/agent/blind_agent.rb
@@ -197,12 +154,18 @@ files:
 - lib/scrappy/webkit/webkit.rb
 - test/test_helper.rb
 - test/test_scrappy.rb
+- Manifest
+- scrappy.gemspec
 has_rdoc: true
 homepage: http://github.com/josei/scrappy
 licenses: []
 
 post_install_message: "**(Optional) Remember to install rbwebkitgtk for visual parsing features**"
 rdoc_options:
+- --line-numbers
+- --inline-source
+- --title
+- Scrappy
 - --main
 - README.rdoc
 require_paths:
@@ -219,8 +182,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - - ">="
       - !ruby/object:Gem::Version
         segments:
--
-
+        - 1
+        - 2
+        version: "1.2"
 requirements: []
 
 rubyforge_project: scrappy