scrappy 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +6 -0
- data/Manifest +21 -14
- data/README.rdoc +5 -9
- data/Rakefile +1 -2
- data/bin/scrappy +141 -51
- data/lib/scrappy.rb +6 -9
- data/lib/scrappy/agent/agent.rb +3 -3
- data/lib/scrappy/extractor/extractor.rb +108 -0
- data/lib/scrappy/{agent → extractor}/formats.rb +0 -0
- data/lib/scrappy/extractor/fragment.rb +111 -0
- data/lib/scrappy/extractor/selector.rb +41 -0
- data/lib/scrappy/{selectors → extractor/selectors}/base_uri.rb +1 -3
- data/lib/scrappy/extractor/selectors/css.rb +5 -0
- data/lib/scrappy/{selectors → extractor/selectors}/new_uri.rb +1 -3
- data/lib/scrappy/{selectors → extractor/selectors}/root.rb +1 -4
- data/lib/scrappy/{selectors → extractor/selectors}/section.rb +1 -4
- data/lib/scrappy/{selectors → extractor/selectors}/slice.rb +1 -3
- data/lib/scrappy/{selectors → extractor/selectors}/uri.rb +2 -4
- data/lib/scrappy/{selectors → extractor/selectors}/uri_pattern.rb +2 -4
- data/lib/scrappy/extractor/selectors/visual.rb +39 -0
- data/lib/scrappy/{selectors → extractor/selectors}/xpath.rb +1 -4
- data/lib/scrappy/server/admin.rb +89 -2
- data/lib/scrappy/server/helpers.rb +11 -2
- data/lib/scrappy/server/server.rb +1 -0
- data/lib/scrappy/trainer/trainer.rb +101 -0
- data/public/javascripts/annotator.js +75 -0
- data/public/javascripts/remote.js +132 -0
- data/public/stylesheets/application.css +39 -12
- data/scrappy.gemspec +13 -11
- data/views/extractors.haml +24 -0
- data/views/layout.haml +14 -4
- data/views/patterns.haml +19 -0
- data/views/samples.haml +28 -0
- metadata +58 -56
- data/lib/scrappy/agent/extractor.rb +0 -196
- data/lib/scrappy/selectors/css.rb +0 -10
- data/public/javascripts/scrappy.js +0 -65
- data/views/kb.haml +0 -15
data/views/samples.haml
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
#body
|
2
|
+
%h1 Sample pages
|
3
|
+
%p
|
4
|
+
Sample pages are used to build extractors as well as visual patterns that can be applied to retrieve data
|
5
|
+
from other pages.
|
6
|
+
%p
|
7
|
+
-if @samples.empty?
|
8
|
+
Currently, there are no samples.
|
9
|
+
-else
|
10
|
+
%ul.detail
|
11
|
+
-@samples.each_with_index do |sample,i|
|
12
|
+
%li
|
13
|
+
%span.action
|
14
|
+
%a{:href=>"#{settings.base_uri}/samples/#{i}", :'data-method'=>:delete, :'data-confirm'=>"Are you sure you want to delete the sample #{sample[:uri]}?"}
|
15
|
+
X
|
16
|
+
%span.short_name
|
17
|
+
-if !sample[:uri].include?('*')
|
18
|
+
%a{:href=>sample[:uri]}=sample[:uri]
|
19
|
+
-else
|
20
|
+
=sample[:uri]
|
21
|
+
-[['Patterns output', :patterns], ['Extractors output', :extractors]].reverse.each do |text, action|
|
22
|
+
%span.format
|
23
|
+
%a{:href=>"#{settings.base_uri}/samples/#{i}/#{action}"}=text
|
24
|
+
%span.format
|
25
|
+
%a{:href=>"#{settings.base_uri}/samples/#{i}/train", :'data-method'=>:post} Train
|
26
|
+
%span.date
|
27
|
+
%a{:href=>"#{settings.base_uri}/samples/#{i}"}
|
28
|
+
=sample[:date].strftime("%Y/%m/%d - %H:%M")
|
metadata
CHANGED
@@ -1,13 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrappy
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash: 19
|
5
4
|
prerelease: false
|
6
5
|
segments:
|
7
6
|
- 0
|
8
7
|
- 3
|
9
|
-
- 0
|
10
|
-
version: 0.3.0
|
8
|
+
- 1
|
9
|
+
version: 0.3.1
|
11
10
|
platform: ruby
|
12
11
|
authors:
|
13
12
|
- Jose Ignacio
|
@@ -15,18 +14,16 @@ autorequire:
|
|
15
14
|
bindir: bin
|
16
15
|
cert_chain: []
|
17
16
|
|
18
|
-
date: 2011-03-
|
17
|
+
date: 2011-03-18 00:00:00 +01:00
|
19
18
|
default_executable:
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
22
21
|
name: activesupport
|
23
22
|
prerelease: false
|
24
23
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
24
|
requirements:
|
27
25
|
- - ">="
|
28
26
|
- !ruby/object:Gem::Version
|
29
|
-
hash: 9
|
30
27
|
segments:
|
31
28
|
- 2
|
32
29
|
- 3
|
@@ -38,11 +35,9 @@ dependencies:
|
|
38
35
|
name: sinatra
|
39
36
|
prerelease: false
|
40
37
|
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
38
|
requirements:
|
43
39
|
- - ">="
|
44
40
|
- !ruby/object:Gem::Version
|
45
|
-
hash: 23
|
46
41
|
segments:
|
47
42
|
- 1
|
48
43
|
- 1
|
@@ -54,11 +49,9 @@ dependencies:
|
|
54
49
|
name: thin
|
55
50
|
prerelease: false
|
56
51
|
requirement: &id003 !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
52
|
requirements:
|
59
53
|
- - ">="
|
60
54
|
- !ruby/object:Gem::Version
|
61
|
-
hash: 17
|
62
55
|
segments:
|
63
56
|
- 1
|
64
57
|
- 2
|
@@ -70,11 +63,9 @@ dependencies:
|
|
70
63
|
name: nokogiri
|
71
64
|
prerelease: false
|
72
65
|
requirement: &id004 !ruby/object:Gem::Requirement
|
73
|
-
none: false
|
74
66
|
requirements:
|
75
67
|
- - ">="
|
76
68
|
- !ruby/object:Gem::Version
|
77
|
-
hash: 5
|
78
69
|
segments:
|
79
70
|
- 1
|
80
71
|
- 4
|
@@ -86,11 +77,9 @@ dependencies:
|
|
86
77
|
name: mechanize
|
87
78
|
prerelease: false
|
88
79
|
requirement: &id005 !ruby/object:Gem::Requirement
|
89
|
-
none: false
|
90
80
|
requirements:
|
91
81
|
- - ">="
|
92
82
|
- !ruby/object:Gem::Version
|
93
|
-
hash: 23
|
94
83
|
segments:
|
95
84
|
- 1
|
96
85
|
- 0
|
@@ -102,27 +91,23 @@ dependencies:
|
|
102
91
|
name: lightrdf
|
103
92
|
prerelease: false
|
104
93
|
requirement: &id006 !ruby/object:Gem::Requirement
|
105
|
-
none: false
|
106
94
|
requirements:
|
107
95
|
- - ">="
|
108
96
|
- !ruby/object:Gem::Version
|
109
|
-
hash: 21
|
110
97
|
segments:
|
111
98
|
- 0
|
112
|
-
-
|
113
|
-
-
|
114
|
-
version: 0.
|
99
|
+
- 3
|
100
|
+
- 0
|
101
|
+
version: 0.3.0
|
115
102
|
type: :runtime
|
116
103
|
version_requirements: *id006
|
117
104
|
- !ruby/object:Gem::Dependency
|
118
105
|
name: i18n
|
119
106
|
prerelease: false
|
120
107
|
requirement: &id007 !ruby/object:Gem::Requirement
|
121
|
-
none: false
|
122
108
|
requirements:
|
123
109
|
- - ">="
|
124
110
|
- !ruby/object:Gem::Version
|
125
|
-
hash: 11
|
126
111
|
segments:
|
127
112
|
- 0
|
128
113
|
- 4
|
@@ -134,11 +119,9 @@ dependencies:
|
|
134
119
|
name: rest-client
|
135
120
|
prerelease: false
|
136
121
|
requirement: &id008 !ruby/object:Gem::Requirement
|
137
|
-
none: false
|
138
122
|
requirements:
|
139
123
|
- - ">="
|
140
124
|
- !ruby/object:Gem::Version
|
141
|
-
hash: 13
|
142
125
|
segments:
|
143
126
|
- 1
|
144
127
|
- 6
|
@@ -150,11 +133,9 @@ dependencies:
|
|
150
133
|
name: haml
|
151
134
|
prerelease: false
|
152
135
|
requirement: &id009 !ruby/object:Gem::Requirement
|
153
|
-
none: false
|
154
136
|
requirements:
|
155
137
|
- - ">="
|
156
138
|
- !ruby/object:Gem::Version
|
157
|
-
hash: 55
|
158
139
|
segments:
|
159
140
|
- 3
|
160
141
|
- 0
|
@@ -162,6 +143,20 @@ dependencies:
|
|
162
143
|
version: 3.0.24
|
163
144
|
type: :runtime
|
164
145
|
version_requirements: *id009
|
146
|
+
- !ruby/object:Gem::Dependency
|
147
|
+
name: rack-flash
|
148
|
+
prerelease: false
|
149
|
+
requirement: &id010 !ruby/object:Gem::Requirement
|
150
|
+
requirements:
|
151
|
+
- - ">="
|
152
|
+
- !ruby/object:Gem::Version
|
153
|
+
segments:
|
154
|
+
- 0
|
155
|
+
- 1
|
156
|
+
- 1
|
157
|
+
version: 0.1.1
|
158
|
+
type: :runtime
|
159
|
+
version_requirements: *id010
|
165
160
|
description: RDF web scraper
|
166
161
|
email: joseignacio.fernandez@gmail.com
|
167
162
|
executables:
|
@@ -176,24 +171,28 @@ extra_rdoc_files:
|
|
176
171
|
- lib/scrappy/agent/blind_agent.rb
|
177
172
|
- lib/scrappy/agent/cache.rb
|
178
173
|
- lib/scrappy/agent/dumper.rb
|
179
|
-
- lib/scrappy/agent/extractor.rb
|
180
|
-
- lib/scrappy/agent/formats.rb
|
181
174
|
- lib/scrappy/agent/map_reduce.rb
|
175
|
+
- lib/scrappy/extractor/extractor.rb
|
176
|
+
- lib/scrappy/extractor/formats.rb
|
177
|
+
- lib/scrappy/extractor/fragment.rb
|
178
|
+
- lib/scrappy/extractor/selector.rb
|
179
|
+
- lib/scrappy/extractor/selectors/base_uri.rb
|
180
|
+
- lib/scrappy/extractor/selectors/css.rb
|
181
|
+
- lib/scrappy/extractor/selectors/new_uri.rb
|
182
|
+
- lib/scrappy/extractor/selectors/root.rb
|
183
|
+
- lib/scrappy/extractor/selectors/section.rb
|
184
|
+
- lib/scrappy/extractor/selectors/slice.rb
|
185
|
+
- lib/scrappy/extractor/selectors/uri.rb
|
186
|
+
- lib/scrappy/extractor/selectors/uri_pattern.rb
|
187
|
+
- lib/scrappy/extractor/selectors/visual.rb
|
188
|
+
- lib/scrappy/extractor/selectors/xpath.rb
|
182
189
|
- lib/scrappy/repository.rb
|
183
|
-
- lib/scrappy/selectors/base_uri.rb
|
184
|
-
- lib/scrappy/selectors/css.rb
|
185
|
-
- lib/scrappy/selectors/new_uri.rb
|
186
|
-
- lib/scrappy/selectors/root.rb
|
187
|
-
- lib/scrappy/selectors/section.rb
|
188
|
-
- lib/scrappy/selectors/slice.rb
|
189
|
-
- lib/scrappy/selectors/uri.rb
|
190
|
-
- lib/scrappy/selectors/uri_pattern.rb
|
191
|
-
- lib/scrappy/selectors/xpath.rb
|
192
190
|
- lib/scrappy/server/admin.rb
|
193
191
|
- lib/scrappy/server/errors.rb
|
194
192
|
- lib/scrappy/server/helpers.rb
|
195
193
|
- lib/scrappy/server/server.rb
|
196
194
|
- lib/scrappy/support.rb
|
195
|
+
- lib/scrappy/trainer/trainer.rb
|
197
196
|
files:
|
198
197
|
- History.txt
|
199
198
|
- Manifest
|
@@ -206,41 +205,48 @@ files:
|
|
206
205
|
- lib/scrappy/agent/blind_agent.rb
|
207
206
|
- lib/scrappy/agent/cache.rb
|
208
207
|
- lib/scrappy/agent/dumper.rb
|
209
|
-
- lib/scrappy/agent/extractor.rb
|
210
|
-
- lib/scrappy/agent/formats.rb
|
211
208
|
- lib/scrappy/agent/map_reduce.rb
|
209
|
+
- lib/scrappy/extractor/extractor.rb
|
210
|
+
- lib/scrappy/extractor/formats.rb
|
211
|
+
- lib/scrappy/extractor/fragment.rb
|
212
|
+
- lib/scrappy/extractor/selector.rb
|
213
|
+
- lib/scrappy/extractor/selectors/base_uri.rb
|
214
|
+
- lib/scrappy/extractor/selectors/css.rb
|
215
|
+
- lib/scrappy/extractor/selectors/new_uri.rb
|
216
|
+
- lib/scrappy/extractor/selectors/root.rb
|
217
|
+
- lib/scrappy/extractor/selectors/section.rb
|
218
|
+
- lib/scrappy/extractor/selectors/slice.rb
|
219
|
+
- lib/scrappy/extractor/selectors/uri.rb
|
220
|
+
- lib/scrappy/extractor/selectors/uri_pattern.rb
|
221
|
+
- lib/scrappy/extractor/selectors/visual.rb
|
222
|
+
- lib/scrappy/extractor/selectors/xpath.rb
|
212
223
|
- lib/scrappy/repository.rb
|
213
|
-
- lib/scrappy/selectors/base_uri.rb
|
214
|
-
- lib/scrappy/selectors/css.rb
|
215
|
-
- lib/scrappy/selectors/new_uri.rb
|
216
|
-
- lib/scrappy/selectors/root.rb
|
217
|
-
- lib/scrappy/selectors/section.rb
|
218
|
-
- lib/scrappy/selectors/slice.rb
|
219
|
-
- lib/scrappy/selectors/uri.rb
|
220
|
-
- lib/scrappy/selectors/uri_pattern.rb
|
221
|
-
- lib/scrappy/selectors/xpath.rb
|
222
224
|
- lib/scrappy/server/admin.rb
|
223
225
|
- lib/scrappy/server/errors.rb
|
224
226
|
- lib/scrappy/server/helpers.rb
|
225
227
|
- lib/scrappy/server/server.rb
|
226
228
|
- lib/scrappy/support.rb
|
229
|
+
- lib/scrappy/trainer/trainer.rb
|
227
230
|
- public/favicon.ico
|
228
231
|
- public/images/logo.png
|
229
232
|
- public/images/logo_tiny.png
|
230
|
-
- public/javascripts/scrappy.js
|
233
|
+
- public/javascripts/annotator.js
|
234
|
+
- public/javascripts/remote.js
|
231
235
|
- public/stylesheets/application.css
|
232
236
|
- test/test_helper.rb
|
233
237
|
- test/test_scrappy.rb
|
238
|
+
- views/extractors.haml
|
234
239
|
- views/help.haml
|
235
240
|
- views/home.haml
|
236
|
-
- views/kb.haml
|
237
241
|
- views/layout.haml
|
242
|
+
- views/patterns.haml
|
243
|
+
- views/samples.haml
|
238
244
|
- scrappy.gemspec
|
239
245
|
has_rdoc: true
|
240
246
|
homepage: http://github.com/josei/scrappy
|
241
247
|
licenses: []
|
242
248
|
|
243
|
-
post_install_message:
|
249
|
+
post_install_message:
|
244
250
|
rdoc_options:
|
245
251
|
- --line-numbers
|
246
252
|
- --inline-source
|
@@ -251,20 +257,16 @@ rdoc_options:
|
|
251
257
|
require_paths:
|
252
258
|
- lib
|
253
259
|
required_ruby_version: !ruby/object:Gem::Requirement
|
254
|
-
none: false
|
255
260
|
requirements:
|
256
261
|
- - ">="
|
257
262
|
- !ruby/object:Gem::Version
|
258
|
-
hash: 3
|
259
263
|
segments:
|
260
264
|
- 0
|
261
265
|
version: "0"
|
262
266
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
263
|
-
none: false
|
264
267
|
requirements:
|
265
268
|
- - ">="
|
266
269
|
- !ruby/object:Gem::Version
|
267
|
-
hash: 11
|
268
270
|
segments:
|
269
271
|
- 1
|
270
272
|
- 2
|
@@ -272,10 +274,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
272
274
|
requirements: []
|
273
275
|
|
274
276
|
rubyforge_project: scrappy
|
275
|
-
rubygems_version: 1.3.
|
277
|
+
rubygems_version: 1.3.6
|
276
278
|
signing_key:
|
277
279
|
specification_version: 3
|
278
280
|
summary: Web scraper that allows producing RDF data out of plain web pages
|
279
281
|
test_files:
|
280
|
-
- test/test_helper.rb
|
281
282
|
- test/test_scrappy.rb
|
283
|
+
- test/test_helper.rb
|
@@ -1,196 +0,0 @@
|
|
1
|
-
require 'digest/md5'
|
2
|
-
|
3
|
-
module Scrappy
|
4
|
-
module Extractor
|
5
|
-
def extract uri, html, referenceable=nil
|
6
|
-
if options.debug
|
7
|
-
print "Extracting #{uri}..."; $stdout.flush
|
8
|
-
end
|
9
|
-
|
10
|
-
@selector_pool ||= {}
|
11
|
-
triples = []
|
12
|
-
content = Nokogiri::HTML(html, nil, 'utf-8')
|
13
|
-
|
14
|
-
uri_selectors = (kb.find(nil, Node('rdf:type'), Node('sc:UriSelector')) + kb.find(nil, Node('rdf:type'), Node('sc:UriPatternSelector'))).flatten.select do |uri_selector|
|
15
|
-
results = selector_pool(uri_selector).filter :content=>content, :uri=>uri
|
16
|
-
!results.empty?
|
17
|
-
end
|
18
|
-
|
19
|
-
fragments = uri_selectors.map { |uri_selector| kb.find(nil, Node('sc:selector'), uri_selector) }.flatten
|
20
|
-
|
21
|
-
fragments.each do |fragment|
|
22
|
-
extract_fragment fragment, :doc=>{:uri=>uri, :content=>content },
|
23
|
-
:parent=>uri, :triples=>triples, :referenceable=>!referenceable.nil?
|
24
|
-
end
|
25
|
-
|
26
|
-
add_referenceable_data content, triples, referenceable if referenceable
|
27
|
-
|
28
|
-
puts "done!" if options.debug
|
29
|
-
|
30
|
-
triples.map do |s,p,o|
|
31
|
-
[ s.is_a?(RDF::Node) ? s.id : s,
|
32
|
-
p.is_a?(RDF::Node) ? p.id : p,
|
33
|
-
o.is_a?(RDF::Node) ? o.id : o ]
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
private
|
38
|
-
def extract_fragment fragment, options={}
|
39
|
-
node = Node(options[:parent])
|
40
|
-
uri = options[:doc][:uri]
|
41
|
-
|
42
|
-
# Select nodes
|
43
|
-
docs = fragment.sc::selector.map { |s| filter s, options[:doc] }.flatten
|
44
|
-
|
45
|
-
# Generate triples
|
46
|
-
docs.each do |doc|
|
47
|
-
# Build URIs if identifier present
|
48
|
-
nodes = fragment.sc::identifier.map { |s| filter s, doc }.flatten.map do |d|
|
49
|
-
node = Node(parse_uri(uri, d[:value]))
|
50
|
-
if options[:referenceable]
|
51
|
-
# Include the fragment where the URI was built from
|
52
|
-
uri_node = Node(nil)
|
53
|
-
options[:triples] << [ node, Node("sc:uri"), uri_node ]
|
54
|
-
options[:triples] << [ uri_node, Node("rdf:value"), node.to_s ]
|
55
|
-
options[:triples] << [ uri_node, Node("sc:source"), Node(node_hash(d[:uri], d[:content].path)) ]
|
56
|
-
end
|
57
|
-
node
|
58
|
-
end
|
59
|
-
nodes << Node(nil) if nodes.empty?
|
60
|
-
|
61
|
-
nodes.each do |node|
|
62
|
-
# Build the object
|
63
|
-
object = if fragment.sc::type.include?(Node('rdf:Literal'))
|
64
|
-
value = doc[:value].to_s.strip
|
65
|
-
if options[:referenceable]
|
66
|
-
bnode = Node(nil)
|
67
|
-
bnode.rdf::value = value
|
68
|
-
bnode.rdf::type = Node('rdf:Literal')
|
69
|
-
options[:triples].push *bnode.triples
|
70
|
-
bnode
|
71
|
-
else
|
72
|
-
value
|
73
|
-
end
|
74
|
-
else
|
75
|
-
fragment.sc::type.each { |type| options[:triples] << [node, Node('rdf:type'), type] if type != Node('rdf:Resource') }
|
76
|
-
fragment.sc::superclass.each { |superclass| options[:triples] << [node, Node('rdfs:subClassOf'), superclass] }
|
77
|
-
fragment.sc::sameas.each { |samenode| options[:triples] << [node, Node('owl:sameAs'), samenode] }
|
78
|
-
node
|
79
|
-
end
|
80
|
-
fragment.sc::relation.each { |relation| options[:triples] << [options[:parent], relation, object] }
|
81
|
-
|
82
|
-
# Add referenceable data if requested
|
83
|
-
if options[:referenceable]
|
84
|
-
sources = [doc[:content]].flatten.map { |node| Node(node_hash(doc[:uri], node.path)) }
|
85
|
-
sources.each do |source|
|
86
|
-
options[:triples] << [ object, Node("sc:source"), source ]
|
87
|
-
fragment.sc::type.each { |t| options[:triples] << [ source, Node("sc:type"), t ] }
|
88
|
-
fragment.sc::relation.each { |relation| options[:triples] << [ source, Node("sc:relation"), relation ] }
|
89
|
-
end
|
90
|
-
end
|
91
|
-
|
92
|
-
# Process subfragments
|
93
|
-
fragment.sc::subfragment.each { |subfragment| extract_fragment subfragment, options.merge(:doc=>doc, :parent=>object) }
|
94
|
-
end
|
95
|
-
end
|
96
|
-
end
|
97
|
-
|
98
|
-
def filter selector, doc
|
99
|
-
if selector.sc::debug.first=="true" and options.debug
|
100
|
-
puts '== DEBUG'
|
101
|
-
puts '== Selector:'
|
102
|
-
puts selector.serialize(:yarf, false)
|
103
|
-
puts '== On fragment:'
|
104
|
-
puts "URI: #{doc[:uri]}"
|
105
|
-
puts "Content: #{doc[:content]}"
|
106
|
-
puts "Value: #{doc[:value]}"
|
107
|
-
end
|
108
|
-
|
109
|
-
# Process selector
|
110
|
-
results = selector_pool(selector).filter doc
|
111
|
-
|
112
|
-
if selector.sc::debug.first=="true" and options.debug
|
113
|
-
puts "== No results" if results.empty?
|
114
|
-
results.each_with_index do |result, i|
|
115
|
-
puts "== Result ##{i}:"
|
116
|
-
puts "URI: #{result[:uri]}"
|
117
|
-
puts "Content: #{result[:content]}"
|
118
|
-
puts "Value: #{result[:value].inspect}"
|
119
|
-
end
|
120
|
-
puts
|
121
|
-
end
|
122
|
-
|
123
|
-
# Return results if no nested selectors
|
124
|
-
return results if selector.sc::selector.empty?
|
125
|
-
|
126
|
-
# Process nested selectors
|
127
|
-
results.map do |result|
|
128
|
-
selector.sc::selector.map { |s| filter s, result }
|
129
|
-
end.flatten
|
130
|
-
end
|
131
|
-
|
132
|
-
def parse_uri(uri, rel_uri)
|
133
|
-
return ID('*') if rel_uri.nil?
|
134
|
-
begin
|
135
|
-
ID(URI::parse(uri.split('/')[0..3]*'/').merge(rel_uri).to_s)
|
136
|
-
rescue
|
137
|
-
ID('*')
|
138
|
-
end
|
139
|
-
end
|
140
|
-
|
141
|
-
def add_referenceable_data content, triples, referenceable
|
142
|
-
resources = {}; triples.each { |s,p,o| resources[o] = true }
|
143
|
-
|
144
|
-
fragment = Node(node_hash(uri, '/'))
|
145
|
-
selector = Node(nil)
|
146
|
-
presentation = Node(nil)
|
147
|
-
|
148
|
-
selector.rdf::type = Node('sc:UnivocalSelector')
|
149
|
-
selector.sc::path = '/'
|
150
|
-
selector.sc::document = uri
|
151
|
-
|
152
|
-
fragment.sc::selector = selector
|
153
|
-
|
154
|
-
triples.push(*fragment.graph.merge(presentation.graph).merge(selector.graph).triples) if referenceable==:dump or resources.include?(fragment)
|
155
|
-
|
156
|
-
content.search('*').each do |node|
|
157
|
-
next if node.text?
|
158
|
-
|
159
|
-
fragment = Node(node_hash(uri, node.path))
|
160
|
-
|
161
|
-
if referenceable == :dump or resources[fragment]
|
162
|
-
selector = Node(nil)
|
163
|
-
presentation = Node(nil)
|
164
|
-
|
165
|
-
triples << [selector, ID('rdf:type'), ID('sc:UnivocalSelector')]
|
166
|
-
triples << [selector, ID('sc:path'), node.path.to_s]
|
167
|
-
triples << [selector, ID('sc:tag'), node.name.to_s]
|
168
|
-
triples << [selector, ID('sc:document'), uri]
|
169
|
-
|
170
|
-
triples << [presentation, ID('sc:x'), node[:vx].to_s] if node[:vx]
|
171
|
-
triples << [presentation, ID('sc:y'), node[:vy].to_s] if node[:vy]
|
172
|
-
triples << [presentation, ID('sc:width'), node[:vw].to_s] if node[:vw]
|
173
|
-
triples << [presentation, ID('sc:height'), node[:vh].to_s] if node[:vh]
|
174
|
-
triples << [presentation, ID('sc:font_size'), node[:vsize].gsub("px","").to_s] if node[:vsize]
|
175
|
-
triples << [presentation, ID('sc:font_weight'), node[:vweight].to_s] if node[:vweight]
|
176
|
-
triples << [presentation, ID('sc:color'), node[:vcolor].to_s] if node[:vcolor]
|
177
|
-
triples << [presentation, ID('sc:background_color'), node[:vbcolor].to_s] if node[:vbcolor]
|
178
|
-
triples << [presentation, ID('sc:text'), node.text.strip]
|
179
|
-
triples << [presentation, ID('sc:children_count'), node.children.select{|n| !n.text?}.size.to_s]
|
180
|
-
|
181
|
-
triples << [fragment, ID('sc:selector'), selector]
|
182
|
-
triples << [fragment, ID('sc:presentation'), presentation]
|
183
|
-
end
|
184
|
-
end
|
185
|
-
end
|
186
|
-
|
187
|
-
def node_hash uri, path
|
188
|
-
digest = Digest::MD5.hexdigest("#{uri} #{path}")
|
189
|
-
:"_:bnode#{digest}"
|
190
|
-
end
|
191
|
-
|
192
|
-
def selector_pool selector
|
193
|
-
@selector_pool[selector.id] ||= kb.node(selector)
|
194
|
-
end
|
195
|
-
end
|
196
|
-
end
|