graffiti 2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,568 @@
1
+ # Graffiti RDF Store
2
+ # (originally written for Samizdat project)
3
+ #
4
+ # Copyright (c) 2002-2011 Dmitry Borodaenko <angdraug@debian.org>
5
+ #
6
+ # This program is free software.
7
+ # You can distribute/modify this program under the terms of
8
+ # the GNU General Public License version 3 or later.
9
+ #
10
+ # see doc/rdf-storage.txt for introduction and Graffiti Squish definition;
11
+ # see doc/storage-impl.txt for explanation of implemented algorithms
12
+ #
13
+ # vim: et sw=2 sts=2 ts=8 tw=0
14
+
15
+ require 'graffiti/exceptions'
16
+ require 'graffiti/sql_mapper'
17
+
18
+ module Graffiti
19
+
20
+ # parse Squish query and translate triples to relational conditions
21
+ #
22
+ # provides access to internal representation of the parsed query and utility
23
+ # functions to deal with Squish syntax
24
+ #
25
+ class SquishQuery
26
+ include Debug
27
+
28
# internal resource reference: a string of digits only (group 1 is the id)
INTERNAL = /\A([[:digit:]]+)\z/.freeze

# a complete blank node token: '?' mark followed by the node name (group 1)
BN = /\A\?([[:alnum:]_]+)\z/.freeze

# blank node tokens found anywhere inside a string
BN_SCAN = /\?[[:alnum:]_]+?\b/.freeze

# a complete parametrized value token: ':' followed by the name (group 1)
PARAMETER = /\A:([[:alnum:]_]+)\z/.freeze

# a complete replaced string literal: quoted index (group 1) into the list
# of extracted strings
LITERAL = /\A'(\d+)'\z/.freeze

# replaced string literals found anywhere inside a string
LITERAL_SCAN = /'(\d+)'/.freeze

# query parameters (group 1) and replaced string literals (group 2) found
# anywhere inside a string
PARAMETER_AND_LITERAL_SCAN = /\B:([[:alnum:]_]+)|'(\d+)'/.freeze

# integer or decimal number with an optional leading minus
NUMBER = /\A-?[[:digit:]]+(\.[[:digit:]]+)?\z/.freeze

# operators and keywords accepted inside expressions (case-insensitive)
OPERATOR = /\A(\+|-|\*|\/|\|\||<|<=|>|>=|=|!=|@@|to_tsvector|to_tsquery|I?LIKE|NOT|AND|OR|IN|IS|NULL)\z/i.freeze

# aggregate function names accepted inside expressions (case-insensitive)
AGGREGATE = /\A(avg|count|max|min|sum)\z/i.freeze

# overall layout of a Squish query; groups in order: query keyword, nodes
# section, WHERE pattern, EXCEPT pattern, OPTIONAL pattern, LITERAL
# expression, GROUP BY expression, ORDER BY expression, order direction,
# USING section
QUERY = /\A\s*(SELECT|INSERT|UPDATE)\b\s*(.*?)\s*
      \bWHERE\b\s*(.*?)\s*
      (?:\bEXCEPT\b\s*(.*?))?\s*
      (?:\bOPTIONAL\b\s*(.*?))?\s*
      (?:\bLITERAL\b\s*(.*?))?\s*
      (?:\bGROUP\s+BY\b\s*(.*?))?\s*
      (?:\bORDER\s+BY\b\s*(.*?)\s*(ASC|DESC)?)?\s*
      (?:\bUSING\b\s*(.*?))?\s*\z/mix.freeze
66
+
67
# build a query object from a Squish query string or from a pre-parsed Hash
#
# _config_ supplies the default namespaces (+config.ns+) and is passed on to
# SqlMapper; _query_ is either a Squish query String or a Hash with :nodes,
# :pattern, :negative, :optional and :strings entries (pre-parsed form used
# by SquishAssert)
#
# for a String: string literals are replaced with 'n' placeholders, the
# query is split into sections, namespaces are resolved and expanded in the
# pattern sections, SQL expressions are validated, and every variable found
# in the query is bound through the SQL mapper
#
# raises ProgrammingError when _query_ is nil, is neither a Hash nor a
# String, or does not match the Squish query layout
#
def initialize(config, query)
  raise ProgrammingError, "SquishQuery: query can't be nil" if query.nil?

  if query.kind_of? Hash   # pre-parsed query (used by SquishAssert)
    @nodes, @pattern, @negative, @optional, @strings =
      query.values_at(:nodes, :pattern, :negative, :optional, :strings)
    @literal = @group = @order = ''
    @sql_mapper = SqlMapper.new(config, @pattern)
    return self
  end

  unless query.kind_of? String
    raise ProgrammingError,
      "Bad query initialization parameter class: #{query.class}"
  end

  debug { query }
  @query = query   # keep original string
  query = query.dup

  # replace string literals with 'n' placeholders (also see #substitute_literals)
  @strings = []
  query.gsub!(/'((?:''|[^'])*)'/m) do
    @strings << $1.gsub("''", "'")   # keep unescaped string
    "'#{@strings.size - 1}'"
  end

  sections = QUERY.match(query)
  unless sections
    raise ProgrammingError,
      "Malformed query: are keywords SELECT, INSERT, UPDATE or WHERE missing?"
  end

  # sections that are absent from the query become empty strings
  @key, @nodes, @pattern, @negative, @optional, @literal,
    @group, @order, @order_dir, @ns = sections.captures.collect {|s| s.to_s }
  @key.upcase!
  @order_dir.upcase!

  # namespaces
  # todo: validate ns
  if @ns.empty? or /\APRESET\s+NS\z/ =~ @ns
    @ns = config.ns
  else
    @ns = Hash[*@ns.gsub(/\b(FOR|AS|AND)\b/i, '').scan(/\S+/)]
  end

  @pattern = parse_pattern(@pattern)
  @optional = parse_pattern(@optional)
  @negative = parse_pattern(@negative)

  # validate SQL expressions
  validate_expression(@literal)
  @group.split(/\s*,\s*/).each {|expr| validate_expression(expr) }
  validate_expression(@order)

  @sql_mapper = SqlMapper.new(
    config, @pattern, @negative, @optional, @literal)

  # check that all variables can be bound
  @variables = query.scan(BN_SCAN)
  @variables.each {|variable| @sql_mapper.bind(variable) }

  self
end
128
+
129
# read-only access to the parsed query sections:
#
# nodes::     blank variables control section
# pattern::   query pattern graph as array of triples [ [p, s, o], ... ]
# literal::   literal SQL expression
# group::     SQL GROUP BY expression
# order::     SQL order expression
# order_dir:: direction of order, ASC or DESC
# ns::        query namespaces mapping
# variables:: list of variables defined in the query
#
attr_reader :nodes, :pattern, :literal, :group, :order, :order_dir, :ns,
  :variables
152
+
153
# the query string exactly as it was handed over for parsing
#
def to_s
  return @query
end
158
+
159
# put quoted string literals back in place of their 'n' placeholders
# (see #new, #LITERAL); anything that is not a String is returned untouched
#
def substitute_literals(s)
  if s.kind_of? String
    s.gsub(LITERAL_SCAN) { get_literal_value($1.to_i) }
  else
    s
  end
end
167
+
168
# replace schema uri with namespace prefix, modifying _uriref_ in place
#
# _uriref_ is rewritten to "prefix::name" only when it consists of _uri_
# followed by a local name that contains neither '/' nor '#'
#
# returns the modified _uriref_, or nil when nothing was replaced
#
def self.uri_shrink!(uriref, prefix, uri)
  # the namespace uri has to be matched literally: unescaped, characters
  # like '.', '?' and '+' would be treated as regexp metacharacters
  uriref.gsub!(/\A#{Regexp.escape(uri.to_s)}([^\/#]+)\z/) { "#{prefix}::#{$1}" }
end
173
+
174
# shrink a copy of _uriref_ using the first matching entry of the supplied
# prefix => uri namespaces hash; _uriref_ itself is not modified
#
# returns the (possibly unchanged) copy, or nil when the copy is nil
#
def self.ns_shrink(uriref, namespaces)
  shrunk = uriref.dup
  return nil unless shrunk

  namespaces.each do |prefix, uri|
    # stop at the first namespace that matched
    break if SquishQuery.uri_shrink!(shrunk, prefix, uri)
  end
  shrunk
end
181
+
182
# shrink _uriref_ using the namespaces of this query
#
def ns_shrink(uriref)
  namespaces = @ns
  SquishQuery.ns_shrink(uriref, namespaces)
end
187
+
188
# check that an expression consists of known tokens only
#
#   expression := value [ operator expression ]
#
#   value := blank_node | literal_string | number | '(' expression ')'
#
# whitespace between tokens (except inside parentheses) is mandatory
#
# returns _string_ unchanged, raises ProgrammingError on the first token
# that is not a blank node, parameter, replaced literal, number, operator
# or aggregate function
#
def validate_expression(string)
  # todo: lexical analyser
  accepted = ['', BN, PARAMETER, LITERAL, NUMBER, OPERATOR, AGGREGATE]

  string.split(/[\s(),]+/).each do |token|
    next if accepted.any? {|kind| kind === token }

    raise ProgrammingError, "Bad token '#{token}' in expression"
  end

  string
end
207
+
208
+ private
209
+
210
# single clause of a pattern graph:
#
#   (predicate subject object [FILTER expression] [TRANSITIVE])
#
PATTERN_SCAN = /\A\((\S+)\s+(\S+)\s+(.*?)(?:\s+FILTER\b\s*(.*?)\s*)?(?:\s+(TRANSITIVE)\s*)?\)\z/.freeze

# parse query pattern graph out of a string, expand URI namespaces
#
# returns an array with one entry per parenthesized clause:
#
#   [predicate, subject, object, filter, transitive]
#
# where _filter_ is the FILTER expression or nil, and _transitive_ is true
# when the clause is marked TRANSITIVE
#
# raises ProgrammingError for a clause that does not follow the clause
# layout, for an undefined namespace prefix, and for a bad token in a
# FILTER expression
#
def parse_pattern(pattern)
  pattern.scan(/\(.*?\)(?=\s*(?:\(|\z))/).collect do |clause|
    # report a malformed clause as a query error instead of failing later
    # on a nil predicate
    parsed = PATTERN_SCAN.match(clause) or raise ProgrammingError,
      "Malformed pattern clause '#{clause}'"
    predicate, subject, object, filter, transitive = parsed.captures

    # expand "prefix::name" into a full URI
    [predicate, subject, object].each do |u|
      u.sub!(/\A(\S+?)::/) do
        @ns[$1] or raise ProgrammingError, "Undefined namespace prefix #{$1}"
      end
    end

    validate_expression(filter.to_s)

    [predicate, subject, object, filter, 'TRANSITIVE' == transitive]
  end
end
230
+
231
# replace RDF query parameters with their values
#
# 'NULL' gives nil, a lone parameter gives its value from _params_, a lone
# replaced literal gives the original unescaped string; in any other
# expression embedded parameters and literals are substituted textually
#
def expression_value(expr, params={})
  return nil if 'NULL' === expr
  return get_parameter_value($~[1], params) if PARAMETER === expr
  return @strings[$~[1].to_i] if LITERAL === expr

  expr.gsub(PARAMETER_AND_LITERAL_SCAN) do
    parameter, literal = $1, $2
    if parameter
      get_parameter_value(parameter, params)
    else
      get_literal_value(literal.to_i)
    end
  end
  # fixme: make Sequel treat it as SQL expression, not a string value
end
252
+
253
# look up the value of parameter _name_ in _params_ (keyed by Symbol)
#
# raises ProgrammingError when _params_ has no such key
#
def get_parameter_value(name, params)
  symbol = name.to_sym
  return params[symbol] if params.has_key?(symbol)

  raise ProgrammingError,
    'Unknown parameter :' + name
end
259
+
260
# string literal number _i_ as a quoted SQL string, with embedded single
# quotes doubled
#
def get_literal_value(i)
  escaped = @strings[i].gsub("'", "''")
  "'#{escaped}'"
end
263
+ end
264
+
265
+
266
+ class SquishSelect < SquishQuery
267
# parse a SELECT query
#
# when the query was given as a String, checks that it is a SELECT query
# and splits the nodes section into a list of validated expressions; a
# pre-parsed Hash query is left as the superclass set it up
#
# raises ProgrammingError for a query of a different type and for a bad
# token in the nodes section
#
def initialize(config, query)
  super(config, query)

  if @key   # initialized from a String, not a Hash
    'SELECT' == @key or raise ProgrammingError,
      'Wrong query type: SELECT expected instead of ' + @key

    @nodes = @nodes.split(/\s*,\s*/).map {|node|
      validate_expression(node)
    }
  end
end
279
+
280
# translate Squish SELECT query to SQL
#
# the select list consists of the query nodes, plus the order expression
# when it is not among them; blank nodes are replaced with their bindings
# and 'n' placeholders with the original string literals
#
# raises ProgrammingError when a '?' is left in the translated query
#
def to_sql
  where = @sql_mapper.where

  columns = @nodes.dup
  columns.push(@order) unless @order.empty? or @nodes.include?(@order)

  # everything after the select list
  tail = "\nFROM #{@sql_mapper.from}"
  tail << "\nWHERE #{where}" unless where.empty?
  tail << "\nGROUP BY #{@group}" unless @group.empty?
  tail << "\nORDER BY #{@order} #{@order_dir}" unless @order.empty?

  select_list = columns.map do |expr|
    bound = bind_blank_nodes(expr)
    name = BN.match(expr)
    # a plain blank node keeps its name as the column alias
    name ? "#{bound} AS #{name[1]}" : bound
  end

  sql = "SELECT DISTINCT #{select_list.join(', ')}#{bind_blank_nodes(tail)}"

  # checked before literals are put back, so that a '?' inside a string
  # literal is not mistaken for a blank node
  if sql =~ /\?/
    raise ProgrammingError,
      "Unexpected '?' in translated query (probably, caused by unmapped blank node): #{sql.gsub(/\s+/, ' ')};"
  end

  substitute_literals(sql)
end
304
+
305
+ private
306
+
307
# substitute every blank node name found in _sql_ with the binding the SQL
# mapper returns for it
#
def bind_blank_nodes(sql)
  mapper = @sql_mapper
  sql.gsub(BN_SCAN) do |blank_node|
    mapper.bind(blank_node)
  end
end
312
+ end
313
+
314
+
315
+ class SquishAssert < SquishQuery
316
# parse an INSERT or UPDATE query
#
# the nodes section is split into @insert, the list of blank nodes from the
# INSERT section, and @update, a mapping from blank node to value
# expression taken from the UPDATE section
#
# raises ProgrammingError when the query is neither INSERT nor UPDATE, when
# something other than a blank node is listed in the INSERT section or on
# the left side of an UPDATE assignment, when an UPDATE assignment has no
# value, and for a bad token in a value expression
#
def initialize(config, query)
  @config = config
  super(@config, query)

  if 'UPDATE' == @key
    @insert = ''
    @update = @nodes

  elsif 'INSERT' == @key and @nodes =~ /\A\s*(.*?)\s*(?:\bUPDATE\b\s*(.*?))?\s*\z/
    @insert, @update = $1, $2.to_s

  else
    # @key is not set when the query was not parsed from a String
    raise ProgrammingError,
      "Wrong query type: INSERT or UPDATE expected instead of " + @key.to_s
  end

  @insert = @insert.split(/\s*,\s*/).each {|s|
    s =~ BN or raise ProgrammingError,
      "Blank node expected in INSERT section instead of '#{s}'"
  }

  # split each assignment on its first '=' only: the value expression may
  # itself contain '=' (e.g. inside '>=' or '!=')
  assignments = @update.split(/\s*,\s*/).collect {|s|
    s.split(/\s*=\s*/, 2)
  }
  assignments.each {|node, value|
    node.to_s =~ BN or raise ProgrammingError,
      "Blank node expected on the left side of UPDATE assignment instead of '#{node}'"
    value or raise ProgrammingError,
      "Value expected on the right side of UPDATE assignment for '#{node}'"
    validate_expression(value)
  }
  @update = Hash[assignments]
end
345
+
346
# execute the assertion against _db_
#
# resolves a value for every node, builds one statement per clause alias,
# runs the statements that have an action, and returns the values of the
# nodes listed in the INSERT section, in that order
#
def run(db, params={})
  values = resource_values(db, params)

  statements = []
  alias_positions.each_value do |clauses|
    statement = SquishAssertStatement.new(clauses, values)
    # statements without an action have nothing to write
    statements << statement if statement.action
  end
  SquishAssertStatement.run_ordered_statements(db, statements)

  @insert.collect {|node| values[node].value }
end
358
+
359
# blank nodes listed in the INSERT section
attr_reader :insert

# blank node => value expression mapping from the UPDATE section
attr_reader :update
360
+
361
+ private
362
+
363
# determine a value for every node known to the SQL mapper
#
# returns a hash that maps each node to a SquishAssertValue; resources that
# cannot be found in _db_ are inserted into the resource table on the way
#
# raises ProgrammingError for a blank node that only occurs in object
# position and has no value in the UPDATE section
#
def resource_values(db, params)
  values = {}

  @sql_mapper.nodes.each do |node, n|
    is_new = false

    case node
    when INTERNAL   # internal resource
      value = $1.to_i   # resource id

    when PARAMETER
      value = get_parameter_value($1, params)

    when LITERAL
      value = @strings[$1.to_i]

    when BN
      subject_position = n[:positions].detect {|p| :subject == p[:role] }

      if subject_position.nil?   # blank node occurring only in object position
        value = @update[node] or raise ProgrammingError,
          %{Blank node #{node} is undefined (drop it or set its value in UPDATE section)}
        value = expression_value(value, params)

      else   # resource blank node
        existing = nil
        unless @insert.include?(node)
          # look the resource up by the part of the pattern reachable from it
          s = SquishSelect.new(
            @config, {
              :nodes => [node],
              :pattern => subgraph(node),
              :strings => @strings
            }
          )
          debug { db[s.to_sql, params].select_sql }
          existing = db.fetch(s.to_sql, params).first
        end

        if existing
          value = existing.values.first

        else
          table = @sql_mapper.clauses[ subject_position[:clause] ][:map].table
          value = db[:resource].insert(:label => table)
          debug { db[:resource].insert_sql(:label => table) }
          is_new = true unless 'resource' == table
        end
      end

    else   # external resource
      uriref = { :uriref => true, :label => node }
      known = db[:resource].filter(uriref).first
      if known
        value = known[:id]
      else
        value = db[:resource].insert(uriref)
        debug { db[:resource].insert_sql(uriref) }
      end
    end

    debug { node + ' = ' + value.inspect }
    values[node] = SquishAssertValue.new(value, is_new, @update.has_key?(node))
  end

  debug { values.inspect }
  values
end
428
+
429
# clauses of the SQL mapper grouped by their :alias entry; aliases and the
# clauses under each alias keep the order in which they appear
#
def alias_positions
  @sql_mapper.clauses.group_by {|clause| clause[:alias] }
end
437
+
438
# collect the triples of the query pattern that can be reached from _node_
# by following triples from subject to object
#
# triples are returned in the order they were reached
#
# fixme: make it work with optional sub-patterns
#
def subgraph(node)
  reachable = [node]
  triples = []

  loop do
    grown = false

    @pattern.each do |triple|
      next unless reachable.include?(triple[1])
      next if triples.include?(triple)

      reachable << triple[2]
      triples << triple
      grown = true
    end

    # repeat until a full pass adds nothing new
    break unless grown
  end

  triples
end
457
+ end
458
+
459
+
460
# value resolved for a node of an assert query, together with two flags
# describing how the value has to be written
#
class SquishAssertValue
  def initialize(value, new, updated)
    @value, @new, @updated = value, new, updated
  end

  # the resolved value
  attr_reader :value

  # true if node was inserted into resource during value generation and a
  # corresponding record should be inserted into an internal resource table
  # later
  #
  def new?; @new; end

  # true if the node value is set in the UPDATE section of the Squish statement
  #
  def updated?; @updated; end
end
483
+
484
+
485
+ class SquishAssertStatement
486
+ include Debug
487
+
488
# prepare the statement for one group of clauses
#
# _clauses_ are the mapper clauses of the group; the subject node and the
# table of the first clause become the key node and the table of the
# statement; _values_ maps nodes to SquishAssertValue objects
#
# the action is :insert when the key node is new and the table is not
# :resource, :update when there are fields to write, and is left unset
# otherwise
#
def initialize(clauses, values)
  head = clauses.first
  @key_node = head[:subject][:node]
  @table = head[:map].table.to_sym

  key = values[@key_node]

  @params = {}
  @references = []
  clauses.each do |clause|
    node = clause[:object][:node]
    v = values[node]

    # only fields of new records and explicitly updated values are written
    next unless key.new? or v.updated?

    field = clause[:object][:field]
    @params[field.to_sym] = v.value

    # when subproperty value is updated, update the qualifier as well
    map = clause[:map]
    if map.subproperty_of
      @params[ RdfPropertyMap.qualifier_field(field).to_sym ] = values[map.property].value
    elsif map.superproperty?
      @params[ RdfPropertyMap.qualifier_field(field).to_sym ] = nil
    end

    @references.push(node) if v.new?
  end

  if key.new? and @table != :resource
    # when id is inserted, insert_resource() trigger does nothing
    @action = :insert
    @params[:id] = key.value

  elsif not @params.empty?
    @action = :update
    @filter = {:id => key.value}
  end

  debug { self.inspect }
end
528
+
529
# subject node of the first clause of this statement
attr_reader :key_node

# object nodes written by this statement whose values are new
attr_reader :references

# :insert, :update, or nil when there is nothing to write
attr_reader :action
530
+
531
# perform the prepared action on the statement's table in _db_
#
# returns whatever the dataset returns for the action, or nil when the
# statement has no action
#
def run(db)
  return nil unless @action

  dataset = db[@table]
  dataset = dataset.filter(@filter) if @filter
  debug { :insert == @action ? dataset.insert_sql(@params) : dataset.update_sql(@params) }
  dataset.send(@action, @params)
end
539
+
540
# make sure mutually referencing records are inserted in the right order
#
# repeatedly runs the first statement whose references have all been
# written already, starting with the statements that have the fewest
# references
#
# raises ProgrammingError when statements remain that cannot be run because
# their references are never satisfied
#
def self.run_ordered_statements(db, statements)
  pending = statements.sort_by {|s| s.references.size }
  inserted = []

  loop do
    break if pending.empty?

    ready = pending.index {|s| (s.references - inserted).empty? }
    break if ready.nil?   # no progress is possible any more

    statement = pending.delete_at(ready)
    statement.run(db)
    inserted << statement.key_node
  end

  pending.empty? or raise ProgrammingError,
    "Failed to resolve mutual references of inserted resources: " +
    pending.collect {|s| s.key_node + ' -- ' + s.references.join(', ') }.join('; ')
end
566
+ end
567
+
568
+ end