scrappy 0.1 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +6 -0
- data/{Manifest.txt → Manifest} +2 -1
- data/README.rdoc +9 -9
- data/Rakefile +16 -14
- data/bin/scrappy +2 -1
- data/lib/js/annotator.js +44 -0
- data/lib/scrappy.rb +1 -1
- data/lib/scrappy/agent/extractor.rb +14 -3
- data/lib/scrappy/agent/visual_agent.rb +34 -9
- data/lib/scrappy/shell.rb +19 -4
- data/scrappy.gemspec +55 -0
- metadata +40 -76
data/History.txt
CHANGED
data/{Manifest.txt → Manifest}
RENAMED
@@ -1,9 +1,9 @@
|
|
1
1
|
History.txt
|
2
|
-
Manifest.txt
|
3
2
|
README.rdoc
|
4
3
|
Rakefile
|
5
4
|
bin/scrappy
|
6
5
|
kb/elmundo.yarf
|
6
|
+
lib/js/annotator.js
|
7
7
|
lib/scrappy.rb
|
8
8
|
lib/scrappy/agent/agent.rb
|
9
9
|
lib/scrappy/agent/blind_agent.rb
|
@@ -17,3 +17,4 @@ lib/scrappy/support.rb
|
|
17
17
|
lib/scrappy/webkit/webkit.rb
|
18
18
|
test/test_helper.rb
|
19
19
|
test/test_scrappy.rb
|
20
|
+
Manifest
|
data/README.rdoc
CHANGED
@@ -1,10 +1,10 @@
|
|
1
|
-
=
|
1
|
+
= scrappy
|
2
2
|
|
3
3
|
* http://github.com/josei/scrappy
|
4
4
|
|
5
5
|
== DESCRIPTION:
|
6
6
|
|
7
|
-
|
7
|
+
scrappy is a tool that allows extracting information from web pages and producing RDF data.
|
8
8
|
It uses the scraping ontology to define the mappings between HTML contents and RDF data.
|
9
9
|
|
10
10
|
An example of mapping is shown next, which allows extracting all titles from http://www.elmundo.es:
|
@@ -54,7 +54,7 @@ tool by typing:
|
|
54
54
|
|
55
55
|
$ scrappy --help
|
56
56
|
|
57
|
-
|
57
|
+
scrappy offers many different interfaces to get RDF data from a web page:
|
58
58
|
|
59
59
|
* Command-line interface:
|
60
60
|
|
@@ -63,7 +63,7 @@ Scrappy offers many different interfaces to get RDF data from a web page:
|
|
63
63
|
* Interactive shell:
|
64
64
|
|
65
65
|
$ scrappy -i
|
66
|
-
Launching
|
66
|
+
Launching scrappy Shell...
|
67
67
|
$ get elmundo.es
|
68
68
|
dc: http://purl.org/dc/elements/1.1/
|
69
69
|
owl: http://www.w3.org/2002/07/owl#
|
@@ -89,7 +89,7 @@ Scrappy offers many different interfaces to get RDF data from a web page:
|
|
89
89
|
* Web Service interface:
|
90
90
|
|
91
91
|
$ scrappy -s
|
92
|
-
Launching
|
92
|
+
Launching scrappy Web Server...
|
93
93
|
** Starting Mongrel on localhost:3434
|
94
94
|
|
95
95
|
Then point your browser to http://localhost:3434 for additional directions.
|
@@ -97,7 +97,7 @@ Scrappy offers many different interfaces to get RDF data from a web page:
|
|
97
97
|
* Web Proxy interface:
|
98
98
|
|
99
99
|
$ scrappy -S
|
100
|
-
Launching
|
100
|
+
Launching scrappy Web Proxy...
|
101
101
|
** Starting Mongrel on localhost:3434
|
102
102
|
|
103
103
|
Then configure your browser's HTTP proxy to http://localhost:3434 and browse http://www.elmundo.es
|
@@ -117,7 +117,7 @@ Scrappy offers many different interfaces to get RDF data from a web page:
|
|
117
117
|
|
118
118
|
* Ruby interface:
|
119
119
|
|
120
|
-
You can use
|
120
|
+
You can use scrappy in a Ruby program by requiring the gem:
|
121
121
|
|
122
122
|
require 'rubygems'
|
123
123
|
require 'scrappy'
|
@@ -126,13 +126,13 @@ Scrappy offers many different interfaces to get RDF data from a web page:
|
|
126
126
|
kb = RDF::Parser.parse(:rdf, open("kb.rdf").read)
|
127
127
|
|
128
128
|
# Create an agent
|
129
|
-
agent =
|
129
|
+
agent = Scrappy::Agent.create :kb=>kb
|
130
130
|
|
131
131
|
# Get RDF output
|
132
132
|
output = agent.request :get, 'http://www.example.com'
|
133
133
|
|
134
134
|
# Output all titles from the web page
|
135
|
-
titles = output.find(
|
135
|
+
titles = output.find([], Node('dc:title'), nil)
|
136
136
|
titles.each { |title| puts title }
|
137
137
|
|
138
138
|
== INSTALL:
|
data/Rakefile
CHANGED
@@ -1,20 +1,22 @@
|
|
1
1
|
require 'rubygems'
|
2
|
-
|
3
|
-
require '
|
4
|
-
require 'fileutils'
|
2
|
+
require 'rake'
|
3
|
+
require 'echoe'
|
5
4
|
require './lib/scrappy'
|
6
5
|
|
7
|
-
|
6
|
+
Echoe.new('scrappy', Scrappy::VERSION) do |p|
|
7
|
+
p.description = "RDF web scraper"
|
8
|
+
p.summary = "Web scraper that allows producing RDF data out of plain web pages"
|
9
|
+
p.url = "http://github.com/josei/scrappy"
|
10
|
+
p.author = "Jose Ignacio"
|
11
|
+
p.email = "joseignacio.fernandez@gmail.com"
|
12
|
+
p.install_message = '**(Optional) Remember to install rbwebkitgtk for visual parsing features**'
|
13
|
+
p.ignore_pattern = ["pkg/*"]
|
14
|
+
p.development_dependencies = [['activesupport','>= 2.3.5'], ['markaby', '>= 0.7.1'], ['camping', '= 2.0'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.1'], ['mongrel', '>= 1.1.5']]
|
15
|
+
end
|
8
16
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
self.developer 'Jose Ignacio', 'joseignacio.fernandez@gmail.com'
|
13
|
-
self.summary = "Web scraper that allows producing RDF data out of plain web pages"
|
14
|
-
self.post_install_message = '**(Optional) Remember to install rbwebkitgtk for visual parsing features**'
|
15
|
-
self.rubyforge_name = self.name
|
16
|
-
self.extra_deps = [['activesupport','>= 2.3.5'], ['markaby', '>= 0.7.1'], ['camping', '= 2.0'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.1']]
|
17
|
+
Rake::RDocTask.new(:rdoc) do |rdoc|
|
18
|
+
rdoc.rdoc_files.include('README.rdoc').include('lib/**/*.rb')
|
19
|
+
rdoc.main = "README.rdoc"
|
17
20
|
end
|
18
21
|
|
19
|
-
|
20
|
-
Dir['tasks/**/*.rake'].each { |t| load t }
|
22
|
+
Dir["#{File.dirname(__FILE__)}/tasks/*.rake"].sort.each
|
data/bin/scrappy
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
#!/usr/bin/ruby
|
2
|
+
# encoding: UTF-8
|
2
3
|
|
3
4
|
stty_save = `stty -g`.chomp
|
4
5
|
trap('INT') { system('stty', stty_save); Scrappy::App.quit }
|
@@ -42,7 +43,7 @@ module Scrappy
|
|
42
43
|
opts.on('-c C', '--concurrence C') { |c| Options.concurrence = c.to_i }
|
43
44
|
opts.on('-d D', '--delay D') { |d| Agent::Options.delay = d; Options.concurrence = 1 }
|
44
45
|
opts.on('-l L', '--levels L') { |l| Agent::Options.depth = l.to_i }
|
45
|
-
opts.on('-v', '--visual') { Agent::Options.agent = :visual }
|
46
|
+
opts.on('-v', '--visual') { Agent::Options.agent = :visual; Options.concurrence = 1 }
|
46
47
|
opts.on('-r', '--reference') { Agent::Options.referenceable = :minimum }
|
47
48
|
opts.on('-R', '--reference-all') { Agent::Options.referenceable = :dump }
|
48
49
|
opts.on('-w', '--window') { Agent::Options.window = true }
|
data/lib/js/annotator.js
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
$(document).ready(function(){
|
2
|
+
$("body").append("<div id='myTrees'></div>")
|
3
|
+
$("#page > *").bind('mouseover', function(e){
|
4
|
+
e.stopPropagation();
|
5
|
+
$(this).addClass("changeBg");
|
6
|
+
})
|
7
|
+
.mouseout(function(){
|
8
|
+
$(this).removeClass("changeBg");
|
9
|
+
});
|
10
|
+
});
|
11
|
+
|
12
|
+
$(document).ready(function(){
|
13
|
+
$("*").bind('click', function(e){
|
14
|
+
e.stopPropagation();
|
15
|
+
var element = $(e.target).closest(this.tagName).get(0).tagName;
|
16
|
+
var parents = $(this).parents();
|
17
|
+
var string = element.toString();
|
18
|
+
for(j=0;j<parents.length;j++) {
|
19
|
+
string = string + " " + parents[j].tagName;
|
20
|
+
}
|
21
|
+
|
22
|
+
var tree = [];
|
23
|
+
var treeString = "";
|
24
|
+
for(h=parents.length-1; h>=0; h-- ) {
|
25
|
+
tree.push(parents[h].tagName);
|
26
|
+
|
27
|
+
if( treeString == "" ) {
|
28
|
+
treeString = treeString + parents[h].tagName;
|
29
|
+
} else {
|
30
|
+
treeString = treeString + " > " + parents[h].tagName;
|
31
|
+
}
|
32
|
+
}
|
33
|
+
|
34
|
+
tree.push(element);
|
35
|
+
treeString = treeString + " > " + element;
|
36
|
+
|
37
|
+
var myTrees = document.getElementById("myTrees");
|
38
|
+
var ul = document.createElement("ul");
|
39
|
+
var li = document.createElement("li");
|
40
|
+
myTrees.appendChild(ul);
|
41
|
+
li.innerHTML = treeString;
|
42
|
+
myTrees.appendChild(li);
|
43
|
+
});
|
44
|
+
});
|
data/lib/scrappy.rb
CHANGED
data/lib/scrappy/agent/extractor.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'digest/md5'
|
2
|
+
|
1
3
|
module Scrappy
|
2
4
|
module Extractor
|
3
5
|
def extract uri, html, referenceable=nil
|
@@ -39,6 +41,7 @@ module Scrappy
|
|
39
41
|
bnode = Node(nil)
|
40
42
|
bnode.rdf::value = value
|
41
43
|
bnode.rdf::type = Node('rdf:Literal')
|
44
|
+
options[:triples].push *bnode.triples
|
42
45
|
bnode
|
43
46
|
else
|
44
47
|
value
|
@@ -53,7 +56,7 @@ module Scrappy
|
|
53
56
|
|
54
57
|
# Add referenceable data if requested
|
55
58
|
if options[:referenceable]
|
56
|
-
source = Node(
|
59
|
+
source = Node(node_hash(doc[:uri], doc[:content].path))
|
57
60
|
options[:triples] << [ object, Node("sc:source"), source ]
|
58
61
|
fragment.sc::type.each { |t| options[:triples] << [ source, Node("sc:type"), t ] }
|
59
62
|
fragment.sc::relation.each { |relation| options[:triples] << [ source, Node("sc:relation"), relation ] }
|
@@ -115,12 +118,13 @@ module Scrappy
|
|
115
118
|
def add_referenceable_data content, triples, referenceable
|
116
119
|
resources = triples.map{|s,p,o| [[s],[o]]}.flatten
|
117
120
|
|
118
|
-
fragment = Node(
|
121
|
+
fragment = Node(node_hash(uri, '/'))
|
119
122
|
selector = Node(nil)
|
120
123
|
presentation = Node(nil)
|
121
124
|
|
122
125
|
selector.rdf::type = Node('sc:UnivocalSelector')
|
123
126
|
selector.sc::path = '/'
|
127
|
+
selector.sc::children = content.search('*').size.to_s
|
124
128
|
selector.sc::uri = uri
|
125
129
|
|
126
130
|
fragment.sc::selector = selector
|
@@ -128,7 +132,7 @@ module Scrappy
|
|
128
132
|
triples.push(*fragment.graph.merge(presentation.graph).merge(selector.graph).triples) if referenceable==:dump or resources.include?(fragment)
|
129
133
|
|
130
134
|
content.search('*').each do |node|
|
131
|
-
fragment = Node(
|
135
|
+
fragment = Node(node_hash(uri, node.path))
|
132
136
|
|
133
137
|
if referenceable == :dump or resources.include?(fragment)
|
134
138
|
selector = Node(nil)
|
@@ -147,6 +151,8 @@ module Scrappy
|
|
147
151
|
presentation.sc::font_weight = node[:vweight].to_s if node[:vweight]
|
148
152
|
presentation.sc::color = node[:vcolor].to_s if node[:vcolor]
|
149
153
|
presentation.sc::background_color = node[:vbcolor].to_s if node[:vbcolor]
|
154
|
+
presentation.sc::text = node.text.strip
|
155
|
+
presentation.sc::children_count = node.search('*').size.to_s
|
150
156
|
|
151
157
|
fragment.sc::selector = selector
|
152
158
|
fragment.sc::presentation = presentation unless presentation.empty?
|
@@ -155,5 +161,10 @@ module Scrappy
|
|
155
161
|
end
|
156
162
|
end
|
157
163
|
end
|
164
|
+
|
165
|
+
def node_hash uri, path
|
166
|
+
digest = Digest::MD5.hexdigest("#{uri} #{path}")
|
167
|
+
"_:bnode#{digest}"
|
168
|
+
end
|
158
169
|
end
|
159
170
|
end
|
data/lib/scrappy/agent/visual_agent.rb
CHANGED
@@ -6,6 +6,8 @@ $stderr = old_stderr
|
|
6
6
|
|
7
7
|
module Scrappy
|
8
8
|
class VisualAgent < Agent
|
9
|
+
attr_reader :visible
|
10
|
+
|
9
11
|
def initialize args={}
|
10
12
|
super
|
11
13
|
|
@@ -18,18 +20,33 @@ module Scrappy
|
|
18
20
|
@window.signal_connect("destroy") { Gtk.main_quit }
|
19
21
|
@window.add(@webview)
|
20
22
|
@window.set_size_request(1024, 600)
|
21
|
-
|
23
|
+
if args[:window] or (args[:window].nil? and Agent::Options.window)
|
24
|
+
@window.show_all
|
25
|
+
@visible = true
|
26
|
+
end
|
22
27
|
end
|
23
28
|
|
24
29
|
def uri
|
25
|
-
@
|
30
|
+
@uri
|
26
31
|
end
|
27
32
|
|
28
33
|
def uri= uri
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
34
|
+
# First, check if the requested uri is a valid HTML page
|
35
|
+
valid = begin
|
36
|
+
Mechanize.new.get(uri).is_a?(Mechanize::Page)
|
37
|
+
rescue
|
38
|
+
false
|
39
|
+
end
|
40
|
+
|
41
|
+
# Open the page in the browser if it's an HTML page
|
42
|
+
if valid
|
43
|
+
synchronize do
|
44
|
+
@webview.open uri.to_s
|
45
|
+
@cv.wait(60) # 1 minute to open the page
|
46
|
+
@uri = @webview.uri
|
47
|
+
end
|
48
|
+
else
|
49
|
+
@uri = nil
|
33
50
|
end
|
34
51
|
end
|
35
52
|
|
@@ -40,7 +57,7 @@ module Scrappy
|
|
40
57
|
def html
|
41
58
|
js "document.documentElement.outerHTML"
|
42
59
|
end
|
43
|
-
|
60
|
+
|
44
61
|
def add_visual_data!
|
45
62
|
js """var items = document.documentElement.getElementsByTagName('*');
|
46
63
|
var i=0;
|
@@ -57,8 +74,6 @@ module Scrappy
|
|
57
74
|
}"""
|
58
75
|
end
|
59
76
|
|
60
|
-
|
61
|
-
private
|
62
77
|
def js code
|
63
78
|
old_title = @webview.title
|
64
79
|
@webview.execute_script("document.title = JSON.stringify(eval(#{ActiveSupport::JSON.encode(code)}))")
|
@@ -66,6 +81,16 @@ module Scrappy
|
|
66
81
|
@webview.execute_script("document.title = #{ActiveSupport::JSON.encode(old_title)}")
|
67
82
|
title
|
68
83
|
end
|
84
|
+
|
85
|
+
def load_js url
|
86
|
+
function = """function include(destination) {
|
87
|
+
var e=window.document.createElement('script');
|
88
|
+
e.setAttribute('src',destination);
|
89
|
+
window.document.body.appendChild(e);
|
90
|
+
}"""
|
91
|
+
js function
|
92
|
+
js "include('#{url}')"
|
93
|
+
end
|
69
94
|
end
|
70
95
|
end
|
71
96
|
|
data/lib/scrappy/shell.rb
CHANGED
@@ -6,7 +6,7 @@ module Scrappy
|
|
6
6
|
end
|
7
7
|
|
8
8
|
def run
|
9
|
-
commands = ['get', '
|
9
|
+
commands = ['get', 'quit', 'help', 'annotate', 'html']
|
10
10
|
|
11
11
|
Readline.completion_append_character = " "
|
12
12
|
Readline.completer_word_break_characters = ""
|
@@ -30,20 +30,35 @@ module Scrappy
|
|
30
30
|
|
31
31
|
code = if command =~ /\Aget\W(.*)\Z/
|
32
32
|
puts @agent.proxy :get, $1
|
33
|
-
puts
|
33
|
+
puts
|
34
34
|
elsif command == 'help'
|
35
35
|
puts 'Available commands:'
|
36
36
|
puts ' get URL: Visit the specified URL'
|
37
|
+
puts ' html: Show HTML code of the current URL'
|
38
|
+
puts ' annotate: Start the annotation tool that helps building extractors'
|
37
39
|
puts ' help: Show this information'
|
38
40
|
puts ' quit: Exit scrappy shell'
|
39
|
-
puts
|
41
|
+
puts
|
42
|
+
elsif command == 'annotate'
|
43
|
+
if @agent.class.to_s == 'Scrappy::VisualAgent' and @agent.visible
|
44
|
+
@agent.load_js "http://ajax.googleapis.com/ajax/libs/jquery/1/jquery.min.js"
|
45
|
+
@agent.load_js "http://github.com/josei/scrappy/raw/master/lib/js/annotator.js"
|
46
|
+
puts "Use the browser's window to annotate resources"
|
47
|
+
puts
|
48
|
+
else
|
49
|
+
puts 'ERROR: Scrappy must be run with -v and -w options to use this feature'
|
50
|
+
puts
|
51
|
+
end
|
52
|
+
elsif command == 'html'
|
53
|
+
puts @agent.html
|
54
|
+
puts
|
40
55
|
elsif command == 'quit'
|
41
56
|
:quit
|
42
57
|
elsif command == '' or command[0..0] == '#'
|
43
58
|
nil
|
44
59
|
else
|
45
60
|
puts "ERROR: Unknown command '#{command}'"
|
46
|
-
puts
|
61
|
+
puts
|
47
62
|
end
|
48
63
|
code
|
49
64
|
end
|
data/scrappy.gemspec
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
Gem::Specification.new do |s|
|
4
|
+
s.name = %q{scrappy}
|
5
|
+
s.version = "0.1.1"
|
6
|
+
|
7
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
|
+
s.authors = ["Jose Ignacio"]
|
9
|
+
s.date = %q{2010-10-29}
|
10
|
+
s.default_executable = %q{scrappy}
|
11
|
+
s.description = %q{RDF web scraper}
|
12
|
+
s.email = %q{joseignacio.fernandez@gmail.com}
|
13
|
+
s.executables = ["scrappy"]
|
14
|
+
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
|
15
|
+
s.files = ["History.txt", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "Manifest", "scrappy.gemspec"]
|
16
|
+
s.homepage = %q{http://github.com/josei/scrappy}
|
17
|
+
s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
|
18
|
+
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
|
19
|
+
s.require_paths = ["lib"]
|
20
|
+
s.rubyforge_project = %q{scrappy}
|
21
|
+
s.rubygems_version = %q{1.3.6}
|
22
|
+
s.summary = %q{Web scraper that allows producing RDF data out of plain web pages}
|
23
|
+
s.test_files = ["test/test_scrappy.rb", "test/test_helper.rb"]
|
24
|
+
|
25
|
+
if s.respond_to? :specification_version then
|
26
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
27
|
+
s.specification_version = 3
|
28
|
+
|
29
|
+
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
30
|
+
s.add_development_dependency(%q<activesupport>, [">= 2.3.5"])
|
31
|
+
s.add_development_dependency(%q<markaby>, [">= 0.7.1"])
|
32
|
+
s.add_development_dependency(%q<camping>, ["= 2.0"])
|
33
|
+
s.add_development_dependency(%q<nokogiri>, [">= 1.4.1"])
|
34
|
+
s.add_development_dependency(%q<mechanize>, [">= 1.0.0"])
|
35
|
+
s.add_development_dependency(%q<lightrdf>, [">= 0.1"])
|
36
|
+
s.add_development_dependency(%q<mongrel>, [">= 1.1.5"])
|
37
|
+
else
|
38
|
+
s.add_dependency(%q<activesupport>, [">= 2.3.5"])
|
39
|
+
s.add_dependency(%q<markaby>, [">= 0.7.1"])
|
40
|
+
s.add_dependency(%q<camping>, ["= 2.0"])
|
41
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
42
|
+
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
43
|
+
s.add_dependency(%q<lightrdf>, [">= 0.1"])
|
44
|
+
s.add_dependency(%q<mongrel>, [">= 1.1.5"])
|
45
|
+
end
|
46
|
+
else
|
47
|
+
s.add_dependency(%q<activesupport>, [">= 2.3.5"])
|
48
|
+
s.add_dependency(%q<markaby>, [">= 0.7.1"])
|
49
|
+
s.add_dependency(%q<camping>, ["= 2.0"])
|
50
|
+
s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
|
51
|
+
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
52
|
+
s.add_dependency(%q<lightrdf>, [">= 0.1"])
|
53
|
+
s.add_dependency(%q<mongrel>, [">= 1.1.5"])
|
54
|
+
end
|
55
|
+
end
|
metadata
CHANGED
@@ -5,7 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
|
8
|
+
- 1
|
9
|
+
version: 0.1.1
|
9
10
|
platform: ruby
|
10
11
|
authors:
|
11
12
|
- Jose Ignacio
|
@@ -13,7 +14,7 @@ autorequire:
|
|
13
14
|
bindir: bin
|
14
15
|
cert_chain: []
|
15
16
|
|
16
|
-
date: 2010-10-
|
17
|
+
date: 2010-10-29 00:00:00 +02:00
|
17
18
|
default_executable:
|
18
19
|
dependencies:
|
19
20
|
- !ruby/object:Gem::Dependency
|
@@ -28,7 +29,7 @@ dependencies:
|
|
28
29
|
- 3
|
29
30
|
- 5
|
30
31
|
version: 2.3.5
|
31
|
-
type: :
|
32
|
+
type: :development
|
32
33
|
version_requirements: *id001
|
33
34
|
- !ruby/object:Gem::Dependency
|
34
35
|
name: markaby
|
@@ -42,7 +43,7 @@ dependencies:
|
|
42
43
|
- 7
|
43
44
|
- 1
|
44
45
|
version: 0.7.1
|
45
|
-
type: :
|
46
|
+
type: :development
|
46
47
|
version_requirements: *id002
|
47
48
|
- !ruby/object:Gem::Dependency
|
48
49
|
name: camping
|
@@ -55,7 +56,7 @@ dependencies:
|
|
55
56
|
- 2
|
56
57
|
- 0
|
57
58
|
version: "2.0"
|
58
|
-
type: :
|
59
|
+
type: :development
|
59
60
|
version_requirements: *id003
|
60
61
|
- !ruby/object:Gem::Dependency
|
61
62
|
name: nokogiri
|
@@ -69,7 +70,7 @@ dependencies:
|
|
69
70
|
- 4
|
70
71
|
- 1
|
71
72
|
version: 1.4.1
|
72
|
-
type: :
|
73
|
+
type: :development
|
73
74
|
version_requirements: *id004
|
74
75
|
- !ruby/object:Gem::Dependency
|
75
76
|
name: mechanize
|
@@ -83,7 +84,7 @@ dependencies:
|
|
83
84
|
- 0
|
84
85
|
- 0
|
85
86
|
version: 1.0.0
|
86
|
-
type: :
|
87
|
+
type: :development
|
87
88
|
version_requirements: *id005
|
88
89
|
- !ruby/object:Gem::Dependency
|
89
90
|
name: lightrdf
|
@@ -96,94 +97,50 @@ dependencies:
|
|
96
97
|
- 0
|
97
98
|
- 1
|
98
99
|
version: "0.1"
|
99
|
-
type: :
|
100
|
+
type: :development
|
100
101
|
version_requirements: *id006
|
101
102
|
- !ruby/object:Gem::Dependency
|
102
|
-
name:
|
103
|
+
name: mongrel
|
103
104
|
prerelease: false
|
104
105
|
requirement: &id007 !ruby/object:Gem::Requirement
|
105
106
|
requirements:
|
106
107
|
- - ">="
|
107
108
|
- !ruby/object:Gem::Version
|
108
109
|
segments:
|
109
|
-
-
|
110
|
-
-
|
111
|
-
-
|
112
|
-
version:
|
110
|
+
- 1
|
111
|
+
- 1
|
112
|
+
- 5
|
113
|
+
version: 1.1.5
|
113
114
|
type: :development
|
114
115
|
version_requirements: *id007
|
115
|
-
|
116
|
-
|
117
|
-
prerelease: false
|
118
|
-
requirement: &id008 !ruby/object:Gem::Requirement
|
119
|
-
requirements:
|
120
|
-
- - ">="
|
121
|
-
- !ruby/object:Gem::Version
|
122
|
-
segments:
|
123
|
-
- 2
|
124
|
-
- 6
|
125
|
-
- 0
|
126
|
-
version: 2.6.0
|
127
|
-
type: :development
|
128
|
-
version_requirements: *id008
|
129
|
-
description: |-
|
130
|
-
Scrappy is a tool that allows extracting information from web pages and producing RDF data.
|
131
|
-
It uses the scraping ontology to define the mappings between HTML contents and RDF data.
|
132
|
-
|
133
|
-
An example of mapping is shown next, which allows extracting all titles from http://www.elmundo.es:
|
134
|
-
|
135
|
-
dc: http://purl.org/dc/elements/1.1/
|
136
|
-
rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
|
137
|
-
sioc: http://rdfs.org/sioc/ns#
|
138
|
-
sc: http://lab.gsi.dit.upm.es/scraping.rdf#
|
139
|
-
*:
|
140
|
-
rdf:type: sc:Fragment
|
141
|
-
sc:selector:
|
142
|
-
*:
|
143
|
-
rdf:type: sc:UriSelector
|
144
|
-
rdf:value: "http://www.elmundo.es/"
|
145
|
-
sc:identifier:
|
146
|
-
*:
|
147
|
-
rdf:type: sc:BaseUriSelector
|
148
|
-
sc:subfragment:
|
149
|
-
*:
|
150
|
-
sc:type: sioc:Post
|
151
|
-
sc:selector:
|
152
|
-
*:
|
153
|
-
rdf:type: sc:CssSelector
|
154
|
-
rdf:value: ".noticia h2, .noticia h3, .noticia h4"
|
155
|
-
sc:identifier:
|
156
|
-
*:
|
157
|
-
rdf:type: sc:CssSelector
|
158
|
-
rdf:value: "a"
|
159
|
-
sc:attribute: "href"
|
160
|
-
sc:subfragment:
|
161
|
-
*:
|
162
|
-
sc:type: rdf:Literal
|
163
|
-
sc:relation: dc:title
|
164
|
-
sc:selector:
|
165
|
-
*:
|
166
|
-
rdf:type: sc:CssSelector
|
167
|
-
rdf:value: "a"
|
168
|
-
|
169
|
-
(The above code is serialized using YARF format, supported by LightRDF gem, as well as
|
170
|
-
RDFXML, JSON, NTriples formats, which can also be used to define the mappings).
|
171
|
-
email:
|
172
|
-
- joseignacio.fernandez@gmail.com
|
116
|
+
description: RDF web scraper
|
117
|
+
email: joseignacio.fernandez@gmail.com
|
173
118
|
executables:
|
174
119
|
- scrappy
|
175
120
|
extensions: []
|
176
121
|
|
177
122
|
extra_rdoc_files:
|
178
|
-
-
|
179
|
-
-
|
123
|
+
- README.rdoc
|
124
|
+
- bin/scrappy
|
125
|
+
- lib/js/annotator.js
|
126
|
+
- lib/scrappy.rb
|
127
|
+
- lib/scrappy/agent/agent.rb
|
128
|
+
- lib/scrappy/agent/blind_agent.rb
|
129
|
+
- lib/scrappy/agent/cluster.rb
|
130
|
+
- lib/scrappy/agent/extractor.rb
|
131
|
+
- lib/scrappy/agent/visual_agent.rb
|
132
|
+
- lib/scrappy/proxy.rb
|
133
|
+
- lib/scrappy/server.rb
|
134
|
+
- lib/scrappy/shell.rb
|
135
|
+
- lib/scrappy/support.rb
|
136
|
+
- lib/scrappy/webkit/webkit.rb
|
180
137
|
files:
|
181
138
|
- History.txt
|
182
|
-
- Manifest.txt
|
183
139
|
- README.rdoc
|
184
140
|
- Rakefile
|
185
141
|
- bin/scrappy
|
186
142
|
- kb/elmundo.yarf
|
143
|
+
- lib/js/annotator.js
|
187
144
|
- lib/scrappy.rb
|
188
145
|
- lib/scrappy/agent/agent.rb
|
189
146
|
- lib/scrappy/agent/blind_agent.rb
|
@@ -197,12 +154,18 @@ files:
|
|
197
154
|
- lib/scrappy/webkit/webkit.rb
|
198
155
|
- test/test_helper.rb
|
199
156
|
- test/test_scrappy.rb
|
157
|
+
- Manifest
|
158
|
+
- scrappy.gemspec
|
200
159
|
has_rdoc: true
|
201
160
|
homepage: http://github.com/josei/scrappy
|
202
161
|
licenses: []
|
203
162
|
|
204
163
|
post_install_message: "**(Optional) Remember to install rbwebkitgtk for visual parsing features**"
|
205
164
|
rdoc_options:
|
165
|
+
- --line-numbers
|
166
|
+
- --inline-source
|
167
|
+
- --title
|
168
|
+
- Scrappy
|
206
169
|
- --main
|
207
170
|
- README.rdoc
|
208
171
|
require_paths:
|
@@ -219,8 +182,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
219
182
|
- - ">="
|
220
183
|
- !ruby/object:Gem::Version
|
221
184
|
segments:
|
222
|
-
-
|
223
|
-
|
185
|
+
- 1
|
186
|
+
- 2
|
187
|
+
version: "1.2"
|
224
188
|
requirements: []
|
225
189
|
|
226
190
|
rubyforge_project: scrappy
|