scrappy 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +6 -0
- data/Manifest +21 -14
- data/README.rdoc +5 -9
- data/Rakefile +1 -2
- data/bin/scrappy +141 -51
- data/lib/scrappy.rb +6 -9
- data/lib/scrappy/agent/agent.rb +3 -3
- data/lib/scrappy/extractor/extractor.rb +108 -0
- data/lib/scrappy/{agent → extractor}/formats.rb +0 -0
- data/lib/scrappy/extractor/fragment.rb +111 -0
- data/lib/scrappy/extractor/selector.rb +41 -0
- data/lib/scrappy/{selectors → extractor/selectors}/base_uri.rb +1 -3
- data/lib/scrappy/extractor/selectors/css.rb +5 -0
- data/lib/scrappy/{selectors → extractor/selectors}/new_uri.rb +1 -3
- data/lib/scrappy/{selectors → extractor/selectors}/root.rb +1 -4
- data/lib/scrappy/{selectors → extractor/selectors}/section.rb +1 -4
- data/lib/scrappy/{selectors → extractor/selectors}/slice.rb +1 -3
- data/lib/scrappy/{selectors → extractor/selectors}/uri.rb +2 -4
- data/lib/scrappy/{selectors → extractor/selectors}/uri_pattern.rb +2 -4
- data/lib/scrappy/extractor/selectors/visual.rb +39 -0
- data/lib/scrappy/{selectors → extractor/selectors}/xpath.rb +1 -4
- data/lib/scrappy/server/admin.rb +89 -2
- data/lib/scrappy/server/helpers.rb +11 -2
- data/lib/scrappy/server/server.rb +1 -0
- data/lib/scrappy/trainer/trainer.rb +101 -0
- data/public/javascripts/annotator.js +75 -0
- data/public/javascripts/remote.js +132 -0
- data/public/stylesheets/application.css +39 -12
- data/scrappy.gemspec +13 -11
- data/views/extractors.haml +24 -0
- data/views/layout.haml +14 -4
- data/views/patterns.haml +19 -0
- data/views/samples.haml +28 -0
- metadata +58 -56
- data/lib/scrappy/agent/extractor.rb +0 -196
- data/lib/scrappy/selectors/css.rb +0 -10
- data/public/javascripts/scrappy.js +0 -65
- data/views/kb.haml +0 -15
data/views/samples.haml
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
#body
|
2
|
+
%h1 Sample pages
|
3
|
+
%p
|
4
|
+
Sample pages are used to build extractors as well as visual patterns that can be applied to retrieve data
|
5
|
+
from other pages.
|
6
|
+
%p
|
7
|
+
-if @samples.empty?
|
8
|
+
Currently, there are no samples.
|
9
|
+
-else
|
10
|
+
%ul.detail
|
11
|
+
-@samples.each_with_index do |sample,i|
|
12
|
+
%li
|
13
|
+
%span.action
|
14
|
+
%a{:href=>"#{settings.base_uri}/samples/#{i}", :'data-method'=>:delete, :'data-confirm'=>"Are you sure you want to delete the sample #{sample[:uri]}?"}
|
15
|
+
X
|
16
|
+
%span.short_name
|
17
|
+
-if !sample[:uri].include?('*')
|
18
|
+
%a{:href=>sample[:uri]}=sample[:uri]
|
19
|
+
-else
|
20
|
+
=sample[:uri]
|
21
|
+
-[['Patterns output', :patterns], ['Extractors output', :extractors]].reverse.each do |text, action|
|
22
|
+
%span.format
|
23
|
+
%a{:href=>"#{settings.base_uri}/samples/#{i}/#{action}"}=text
|
24
|
+
%span.format
|
25
|
+
%a{:href=>"#{settings.base_uri}/samples/#{i}/train", :'data-method'=>:post} Train
|
26
|
+
%span.date
|
27
|
+
%a{:href=>"#{settings.base_uri}/samples/#{i}"}
|
28
|
+
=sample[:date].strftime("%Y/%m/%d - %H:%M")
|
metadata
CHANGED
@@ -1,13 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrappy
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash: 19
|
5
4
|
prerelease: false
|
6
5
|
segments:
|
7
6
|
- 0
|
8
7
|
- 3
|
9
|
-
-
|
10
|
-
version: 0.3.
|
8
|
+
- 1
|
9
|
+
version: 0.3.1
|
11
10
|
platform: ruby
|
12
11
|
authors:
|
13
12
|
- Jose Ignacio
|
@@ -15,18 +14,16 @@ autorequire:
|
|
15
14
|
bindir: bin
|
16
15
|
cert_chain: []
|
17
16
|
|
18
|
-
date: 2011-03-
|
17
|
+
date: 2011-03-18 00:00:00 +01:00
|
19
18
|
default_executable:
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
22
21
|
name: activesupport
|
23
22
|
prerelease: false
|
24
23
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
24
|
requirements:
|
27
25
|
- - ">="
|
28
26
|
- !ruby/object:Gem::Version
|
29
|
-
hash: 9
|
30
27
|
segments:
|
31
28
|
- 2
|
32
29
|
- 3
|
@@ -38,11 +35,9 @@ dependencies:
|
|
38
35
|
name: sinatra
|
39
36
|
prerelease: false
|
40
37
|
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
38
|
requirements:
|
43
39
|
- - ">="
|
44
40
|
- !ruby/object:Gem::Version
|
45
|
-
hash: 23
|
46
41
|
segments:
|
47
42
|
- 1
|
48
43
|
- 1
|
@@ -54,11 +49,9 @@ dependencies:
|
|
54
49
|
name: thin
|
55
50
|
prerelease: false
|
56
51
|
requirement: &id003 !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
52
|
requirements:
|
59
53
|
- - ">="
|
60
54
|
- !ruby/object:Gem::Version
|
61
|
-
hash: 17
|
62
55
|
segments:
|
63
56
|
- 1
|
64
57
|
- 2
|
@@ -70,11 +63,9 @@ dependencies:
|
|
70
63
|
name: nokogiri
|
71
64
|
prerelease: false
|
72
65
|
requirement: &id004 !ruby/object:Gem::Requirement
|
73
|
-
none: false
|
74
66
|
requirements:
|
75
67
|
- - ">="
|
76
68
|
- !ruby/object:Gem::Version
|
77
|
-
hash: 5
|
78
69
|
segments:
|
79
70
|
- 1
|
80
71
|
- 4
|
@@ -86,11 +77,9 @@ dependencies:
|
|
86
77
|
name: mechanize
|
87
78
|
prerelease: false
|
88
79
|
requirement: &id005 !ruby/object:Gem::Requirement
|
89
|
-
none: false
|
90
80
|
requirements:
|
91
81
|
- - ">="
|
92
82
|
- !ruby/object:Gem::Version
|
93
|
-
hash: 23
|
94
83
|
segments:
|
95
84
|
- 1
|
96
85
|
- 0
|
@@ -102,27 +91,23 @@ dependencies:
|
|
102
91
|
name: lightrdf
|
103
92
|
prerelease: false
|
104
93
|
requirement: &id006 !ruby/object:Gem::Requirement
|
105
|
-
none: false
|
106
94
|
requirements:
|
107
95
|
- - ">="
|
108
96
|
- !ruby/object:Gem::Version
|
109
|
-
hash: 21
|
110
97
|
segments:
|
111
98
|
- 0
|
112
|
-
-
|
113
|
-
-
|
114
|
-
version: 0.
|
99
|
+
- 3
|
100
|
+
- 0
|
101
|
+
version: 0.3.0
|
115
102
|
type: :runtime
|
116
103
|
version_requirements: *id006
|
117
104
|
- !ruby/object:Gem::Dependency
|
118
105
|
name: i18n
|
119
106
|
prerelease: false
|
120
107
|
requirement: &id007 !ruby/object:Gem::Requirement
|
121
|
-
none: false
|
122
108
|
requirements:
|
123
109
|
- - ">="
|
124
110
|
- !ruby/object:Gem::Version
|
125
|
-
hash: 11
|
126
111
|
segments:
|
127
112
|
- 0
|
128
113
|
- 4
|
@@ -134,11 +119,9 @@ dependencies:
|
|
134
119
|
name: rest-client
|
135
120
|
prerelease: false
|
136
121
|
requirement: &id008 !ruby/object:Gem::Requirement
|
137
|
-
none: false
|
138
122
|
requirements:
|
139
123
|
- - ">="
|
140
124
|
- !ruby/object:Gem::Version
|
141
|
-
hash: 13
|
142
125
|
segments:
|
143
126
|
- 1
|
144
127
|
- 6
|
@@ -150,11 +133,9 @@ dependencies:
|
|
150
133
|
name: haml
|
151
134
|
prerelease: false
|
152
135
|
requirement: &id009 !ruby/object:Gem::Requirement
|
153
|
-
none: false
|
154
136
|
requirements:
|
155
137
|
- - ">="
|
156
138
|
- !ruby/object:Gem::Version
|
157
|
-
hash: 55
|
158
139
|
segments:
|
159
140
|
- 3
|
160
141
|
- 0
|
@@ -162,6 +143,20 @@ dependencies:
|
|
162
143
|
version: 3.0.24
|
163
144
|
type: :runtime
|
164
145
|
version_requirements: *id009
|
146
|
+
- !ruby/object:Gem::Dependency
|
147
|
+
name: rack-flash
|
148
|
+
prerelease: false
|
149
|
+
requirement: &id010 !ruby/object:Gem::Requirement
|
150
|
+
requirements:
|
151
|
+
- - ">="
|
152
|
+
- !ruby/object:Gem::Version
|
153
|
+
segments:
|
154
|
+
- 0
|
155
|
+
- 1
|
156
|
+
- 1
|
157
|
+
version: 0.1.1
|
158
|
+
type: :runtime
|
159
|
+
version_requirements: *id010
|
165
160
|
description: RDF web scraper
|
166
161
|
email: joseignacio.fernandez@gmail.com
|
167
162
|
executables:
|
@@ -176,24 +171,28 @@ extra_rdoc_files:
|
|
176
171
|
- lib/scrappy/agent/blind_agent.rb
|
177
172
|
- lib/scrappy/agent/cache.rb
|
178
173
|
- lib/scrappy/agent/dumper.rb
|
179
|
-
- lib/scrappy/agent/extractor.rb
|
180
|
-
- lib/scrappy/agent/formats.rb
|
181
174
|
- lib/scrappy/agent/map_reduce.rb
|
175
|
+
- lib/scrappy/extractor/extractor.rb
|
176
|
+
- lib/scrappy/extractor/formats.rb
|
177
|
+
- lib/scrappy/extractor/fragment.rb
|
178
|
+
- lib/scrappy/extractor/selector.rb
|
179
|
+
- lib/scrappy/extractor/selectors/base_uri.rb
|
180
|
+
- lib/scrappy/extractor/selectors/css.rb
|
181
|
+
- lib/scrappy/extractor/selectors/new_uri.rb
|
182
|
+
- lib/scrappy/extractor/selectors/root.rb
|
183
|
+
- lib/scrappy/extractor/selectors/section.rb
|
184
|
+
- lib/scrappy/extractor/selectors/slice.rb
|
185
|
+
- lib/scrappy/extractor/selectors/uri.rb
|
186
|
+
- lib/scrappy/extractor/selectors/uri_pattern.rb
|
187
|
+
- lib/scrappy/extractor/selectors/visual.rb
|
188
|
+
- lib/scrappy/extractor/selectors/xpath.rb
|
182
189
|
- lib/scrappy/repository.rb
|
183
|
-
- lib/scrappy/selectors/base_uri.rb
|
184
|
-
- lib/scrappy/selectors/css.rb
|
185
|
-
- lib/scrappy/selectors/new_uri.rb
|
186
|
-
- lib/scrappy/selectors/root.rb
|
187
|
-
- lib/scrappy/selectors/section.rb
|
188
|
-
- lib/scrappy/selectors/slice.rb
|
189
|
-
- lib/scrappy/selectors/uri.rb
|
190
|
-
- lib/scrappy/selectors/uri_pattern.rb
|
191
|
-
- lib/scrappy/selectors/xpath.rb
|
192
190
|
- lib/scrappy/server/admin.rb
|
193
191
|
- lib/scrappy/server/errors.rb
|
194
192
|
- lib/scrappy/server/helpers.rb
|
195
193
|
- lib/scrappy/server/server.rb
|
196
194
|
- lib/scrappy/support.rb
|
195
|
+
- lib/scrappy/trainer/trainer.rb
|
197
196
|
files:
|
198
197
|
- History.txt
|
199
198
|
- Manifest
|
@@ -206,41 +205,48 @@ files:
|
|
206
205
|
- lib/scrappy/agent/blind_agent.rb
|
207
206
|
- lib/scrappy/agent/cache.rb
|
208
207
|
- lib/scrappy/agent/dumper.rb
|
209
|
-
- lib/scrappy/agent/extractor.rb
|
210
|
-
- lib/scrappy/agent/formats.rb
|
211
208
|
- lib/scrappy/agent/map_reduce.rb
|
209
|
+
- lib/scrappy/extractor/extractor.rb
|
210
|
+
- lib/scrappy/extractor/formats.rb
|
211
|
+
- lib/scrappy/extractor/fragment.rb
|
212
|
+
- lib/scrappy/extractor/selector.rb
|
213
|
+
- lib/scrappy/extractor/selectors/base_uri.rb
|
214
|
+
- lib/scrappy/extractor/selectors/css.rb
|
215
|
+
- lib/scrappy/extractor/selectors/new_uri.rb
|
216
|
+
- lib/scrappy/extractor/selectors/root.rb
|
217
|
+
- lib/scrappy/extractor/selectors/section.rb
|
218
|
+
- lib/scrappy/extractor/selectors/slice.rb
|
219
|
+
- lib/scrappy/extractor/selectors/uri.rb
|
220
|
+
- lib/scrappy/extractor/selectors/uri_pattern.rb
|
221
|
+
- lib/scrappy/extractor/selectors/visual.rb
|
222
|
+
- lib/scrappy/extractor/selectors/xpath.rb
|
212
223
|
- lib/scrappy/repository.rb
|
213
|
-
- lib/scrappy/selectors/base_uri.rb
|
214
|
-
- lib/scrappy/selectors/css.rb
|
215
|
-
- lib/scrappy/selectors/new_uri.rb
|
216
|
-
- lib/scrappy/selectors/root.rb
|
217
|
-
- lib/scrappy/selectors/section.rb
|
218
|
-
- lib/scrappy/selectors/slice.rb
|
219
|
-
- lib/scrappy/selectors/uri.rb
|
220
|
-
- lib/scrappy/selectors/uri_pattern.rb
|
221
|
-
- lib/scrappy/selectors/xpath.rb
|
222
224
|
- lib/scrappy/server/admin.rb
|
223
225
|
- lib/scrappy/server/errors.rb
|
224
226
|
- lib/scrappy/server/helpers.rb
|
225
227
|
- lib/scrappy/server/server.rb
|
226
228
|
- lib/scrappy/support.rb
|
229
|
+
- lib/scrappy/trainer/trainer.rb
|
227
230
|
- public/favicon.ico
|
228
231
|
- public/images/logo.png
|
229
232
|
- public/images/logo_tiny.png
|
230
|
-
- public/javascripts/
|
233
|
+
- public/javascripts/annotator.js
|
234
|
+
- public/javascripts/remote.js
|
231
235
|
- public/stylesheets/application.css
|
232
236
|
- test/test_helper.rb
|
233
237
|
- test/test_scrappy.rb
|
238
|
+
- views/extractors.haml
|
234
239
|
- views/help.haml
|
235
240
|
- views/home.haml
|
236
|
-
- views/kb.haml
|
237
241
|
- views/layout.haml
|
242
|
+
- views/patterns.haml
|
243
|
+
- views/samples.haml
|
238
244
|
- scrappy.gemspec
|
239
245
|
has_rdoc: true
|
240
246
|
homepage: http://github.com/josei/scrappy
|
241
247
|
licenses: []
|
242
248
|
|
243
|
-
post_install_message:
|
249
|
+
post_install_message:
|
244
250
|
rdoc_options:
|
245
251
|
- --line-numbers
|
246
252
|
- --inline-source
|
@@ -251,20 +257,16 @@ rdoc_options:
|
|
251
257
|
require_paths:
|
252
258
|
- lib
|
253
259
|
required_ruby_version: !ruby/object:Gem::Requirement
|
254
|
-
none: false
|
255
260
|
requirements:
|
256
261
|
- - ">="
|
257
262
|
- !ruby/object:Gem::Version
|
258
|
-
hash: 3
|
259
263
|
segments:
|
260
264
|
- 0
|
261
265
|
version: "0"
|
262
266
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
263
|
-
none: false
|
264
267
|
requirements:
|
265
268
|
- - ">="
|
266
269
|
- !ruby/object:Gem::Version
|
267
|
-
hash: 11
|
268
270
|
segments:
|
269
271
|
- 1
|
270
272
|
- 2
|
@@ -272,10 +274,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
272
274
|
requirements: []
|
273
275
|
|
274
276
|
rubyforge_project: scrappy
|
275
|
-
rubygems_version: 1.3.
|
277
|
+
rubygems_version: 1.3.6
|
276
278
|
signing_key:
|
277
279
|
specification_version: 3
|
278
280
|
summary: Web scraper that allows producing RDF data out of plain web pages
|
279
281
|
test_files:
|
280
|
-
- test/test_helper.rb
|
281
282
|
- test/test_scrappy.rb
|
283
|
+
- test/test_helper.rb
|
@@ -1,196 +0,0 @@
|
|
1
|
-
require 'digest/md5'
|
2
|
-
|
3
|
-
module Scrappy
|
4
|
-
module Extractor
|
5
|
-
def extract uri, html, referenceable=nil
|
6
|
-
if options.debug
|
7
|
-
print "Extracting #{uri}..."; $stdout.flush
|
8
|
-
end
|
9
|
-
|
10
|
-
@selector_pool ||= {}
|
11
|
-
triples = []
|
12
|
-
content = Nokogiri::HTML(html, nil, 'utf-8')
|
13
|
-
|
14
|
-
uri_selectors = (kb.find(nil, Node('rdf:type'), Node('sc:UriSelector')) + kb.find(nil, Node('rdf:type'), Node('sc:UriPatternSelector'))).flatten.select do |uri_selector|
|
15
|
-
results = selector_pool(uri_selector).filter :content=>content, :uri=>uri
|
16
|
-
!results.empty?
|
17
|
-
end
|
18
|
-
|
19
|
-
fragments = uri_selectors.map { |uri_selector| kb.find(nil, Node('sc:selector'), uri_selector) }.flatten
|
20
|
-
|
21
|
-
fragments.each do |fragment|
|
22
|
-
extract_fragment fragment, :doc=>{:uri=>uri, :content=>content },
|
23
|
-
:parent=>uri, :triples=>triples, :referenceable=>!referenceable.nil?
|
24
|
-
end
|
25
|
-
|
26
|
-
add_referenceable_data content, triples, referenceable if referenceable
|
27
|
-
|
28
|
-
puts "done!" if options.debug
|
29
|
-
|
30
|
-
triples.map do |s,p,o|
|
31
|
-
[ s.is_a?(RDF::Node) ? s.id : s,
|
32
|
-
p.is_a?(RDF::Node) ? p.id : p,
|
33
|
-
o.is_a?(RDF::Node) ? o.id : o ]
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
private
|
38
|
-
def extract_fragment fragment, options={}
|
39
|
-
node = Node(options[:parent])
|
40
|
-
uri = options[:doc][:uri]
|
41
|
-
|
42
|
-
# Select nodes
|
43
|
-
docs = fragment.sc::selector.map { |s| filter s, options[:doc] }.flatten
|
44
|
-
|
45
|
-
# Generate triples
|
46
|
-
docs.each do |doc|
|
47
|
-
# Build URIs if identifier present
|
48
|
-
nodes = fragment.sc::identifier.map { |s| filter s, doc }.flatten.map do |d|
|
49
|
-
node = Node(parse_uri(uri, d[:value]))
|
50
|
-
if options[:referenceable]
|
51
|
-
# Include the fragment where the URI was built from
|
52
|
-
uri_node = Node(nil)
|
53
|
-
options[:triples] << [ node, Node("sc:uri"), uri_node ]
|
54
|
-
options[:triples] << [ uri_node, Node("rdf:value"), node.to_s ]
|
55
|
-
options[:triples] << [ uri_node, Node("sc:source"), Node(node_hash(d[:uri], d[:content].path)) ]
|
56
|
-
end
|
57
|
-
node
|
58
|
-
end
|
59
|
-
nodes << Node(nil) if nodes.empty?
|
60
|
-
|
61
|
-
nodes.each do |node|
|
62
|
-
# Build the object
|
63
|
-
object = if fragment.sc::type.include?(Node('rdf:Literal'))
|
64
|
-
value = doc[:value].to_s.strip
|
65
|
-
if options[:referenceable]
|
66
|
-
bnode = Node(nil)
|
67
|
-
bnode.rdf::value = value
|
68
|
-
bnode.rdf::type = Node('rdf:Literal')
|
69
|
-
options[:triples].push *bnode.triples
|
70
|
-
bnode
|
71
|
-
else
|
72
|
-
value
|
73
|
-
end
|
74
|
-
else
|
75
|
-
fragment.sc::type.each { |type| options[:triples] << [node, Node('rdf:type'), type] if type != Node('rdf:Resource') }
|
76
|
-
fragment.sc::superclass.each { |superclass| options[:triples] << [node, Node('rdfs:subClassOf'), superclass] }
|
77
|
-
fragment.sc::sameas.each { |samenode| options[:triples] << [node, Node('owl:sameAs'), samenode] }
|
78
|
-
node
|
79
|
-
end
|
80
|
-
fragment.sc::relation.each { |relation| options[:triples] << [options[:parent], relation, object] }
|
81
|
-
|
82
|
-
# Add referenceable data if requested
|
83
|
-
if options[:referenceable]
|
84
|
-
sources = [doc[:content]].flatten.map { |node| Node(node_hash(doc[:uri], node.path)) }
|
85
|
-
sources.each do |source|
|
86
|
-
options[:triples] << [ object, Node("sc:source"), source ]
|
87
|
-
fragment.sc::type.each { |t| options[:triples] << [ source, Node("sc:type"), t ] }
|
88
|
-
fragment.sc::relation.each { |relation| options[:triples] << [ source, Node("sc:relation"), relation ] }
|
89
|
-
end
|
90
|
-
end
|
91
|
-
|
92
|
-
# Process subfragments
|
93
|
-
fragment.sc::subfragment.each { |subfragment| extract_fragment subfragment, options.merge(:doc=>doc, :parent=>object) }
|
94
|
-
end
|
95
|
-
end
|
96
|
-
end
|
97
|
-
|
98
|
-
def filter selector, doc
|
99
|
-
if selector.sc::debug.first=="true" and options.debug
|
100
|
-
puts '== DEBUG'
|
101
|
-
puts '== Selector:'
|
102
|
-
puts selector.serialize(:yarf, false)
|
103
|
-
puts '== On fragment:'
|
104
|
-
puts "URI: #{doc[:uri]}"
|
105
|
-
puts "Content: #{doc[:content]}"
|
106
|
-
puts "Value: #{doc[:value]}"
|
107
|
-
end
|
108
|
-
|
109
|
-
# Process selector
|
110
|
-
results = selector_pool(selector).filter doc
|
111
|
-
|
112
|
-
if selector.sc::debug.first=="true" and options.debug
|
113
|
-
puts "== No results" if results.empty?
|
114
|
-
results.each_with_index do |result, i|
|
115
|
-
puts "== Result ##{i}:"
|
116
|
-
puts "URI: #{result[:uri]}"
|
117
|
-
puts "Content: #{result[:content]}"
|
118
|
-
puts "Value: #{result[:value].inspect}"
|
119
|
-
end
|
120
|
-
puts
|
121
|
-
end
|
122
|
-
|
123
|
-
# Return results if no nested selectors
|
124
|
-
return results if selector.sc::selector.empty?
|
125
|
-
|
126
|
-
# Process nested selectors
|
127
|
-
results.map do |result|
|
128
|
-
selector.sc::selector.map { |s| filter s, result }
|
129
|
-
end.flatten
|
130
|
-
end
|
131
|
-
|
132
|
-
def parse_uri(uri, rel_uri)
|
133
|
-
return ID('*') if rel_uri.nil?
|
134
|
-
begin
|
135
|
-
ID(URI::parse(uri.split('/')[0..3]*'/').merge(rel_uri).to_s)
|
136
|
-
rescue
|
137
|
-
ID('*')
|
138
|
-
end
|
139
|
-
end
|
140
|
-
|
141
|
-
def add_referenceable_data content, triples, referenceable
|
142
|
-
resources = {}; triples.each { |s,p,o| resources[o] = true }
|
143
|
-
|
144
|
-
fragment = Node(node_hash(uri, '/'))
|
145
|
-
selector = Node(nil)
|
146
|
-
presentation = Node(nil)
|
147
|
-
|
148
|
-
selector.rdf::type = Node('sc:UnivocalSelector')
|
149
|
-
selector.sc::path = '/'
|
150
|
-
selector.sc::document = uri
|
151
|
-
|
152
|
-
fragment.sc::selector = selector
|
153
|
-
|
154
|
-
triples.push(*fragment.graph.merge(presentation.graph).merge(selector.graph).triples) if referenceable==:dump or resources.include?(fragment)
|
155
|
-
|
156
|
-
content.search('*').each do |node|
|
157
|
-
next if node.text?
|
158
|
-
|
159
|
-
fragment = Node(node_hash(uri, node.path))
|
160
|
-
|
161
|
-
if referenceable == :dump or resources[fragment]
|
162
|
-
selector = Node(nil)
|
163
|
-
presentation = Node(nil)
|
164
|
-
|
165
|
-
triples << [selector, ID('rdf:type'), ID('sc:UnivocalSelector')]
|
166
|
-
triples << [selector, ID('sc:path'), node.path.to_s]
|
167
|
-
triples << [selector, ID('sc:tag'), node.name.to_s]
|
168
|
-
triples << [selector, ID('sc:document'), uri]
|
169
|
-
|
170
|
-
triples << [presentation, ID('sc:x'), node[:vx].to_s] if node[:vx]
|
171
|
-
triples << [presentation, ID('sc:y'), node[:vy].to_s] if node[:vy]
|
172
|
-
triples << [presentation, ID('sc:width'), node[:vw].to_s] if node[:vw]
|
173
|
-
triples << [presentation, ID('sc:height'), node[:vh].to_s] if node[:vh]
|
174
|
-
triples << [presentation, ID('sc:font_size'), node[:vsize].gsub("px","").to_s] if node[:vsize]
|
175
|
-
triples << [presentation, ID('sc:font_weight'), node[:vweight].to_s] if node[:vweight]
|
176
|
-
triples << [presentation, ID('sc:color'), node[:vcolor].to_s] if node[:vcolor]
|
177
|
-
triples << [presentation, ID('sc:background_color'), node[:vbcolor].to_s] if node[:vbcolor]
|
178
|
-
triples << [presentation, ID('sc:text'), node.text.strip]
|
179
|
-
triples << [presentation, ID('sc:children_count'), node.children.select{|n| !n.text?}.size.to_s]
|
180
|
-
|
181
|
-
triples << [fragment, ID('sc:selector'), selector]
|
182
|
-
triples << [fragment, ID('sc:presentation'), presentation]
|
183
|
-
end
|
184
|
-
end
|
185
|
-
end
|
186
|
-
|
187
|
-
def node_hash uri, path
|
188
|
-
digest = Digest::MD5.hexdigest("#{uri} #{path}")
|
189
|
-
:"_:bnode#{digest}"
|
190
|
-
end
|
191
|
-
|
192
|
-
def selector_pool selector
|
193
|
-
@selector_pool[selector.id] ||= kb.node(selector)
|
194
|
-
end
|
195
|
-
end
|
196
|
-
end
|