activefacts-compositions 1.9.19 → 1.9.20

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2dd043b17bcbdb5bf4f525fe24022ad95a32946e
4
- data.tar.gz: 95fb5b1b17e005cc0bd90f6f9c527e6455557a87
3
+ metadata.gz: 074d060502e336f08e6e114ace5990b6d890f890
4
+ data.tar.gz: 4ddff75043dc0ce04ef5651fe69b105bee492827
5
5
  SHA512:
6
- metadata.gz: 528cbe042208d8dd290ef5cd5e11ae390261f25c04c4e5558ef111f7fc18aab8a82ea1421ee2147c71d3659c077c93af632ba7372fc337b2797f2a6c69846b30
7
- data.tar.gz: fdf3d9610a0751f91d8b2b8fb022918aa81c31b99a73a359100622cef6f87d04338428253172921bde29e48906b25622915ea89d20fa416e0d156e8fd1695fff
6
+ metadata.gz: cfb954b0e3a018ca4bfc959644e2b8780e98789f500dbc9891629c299a8bc3e641c5830a9ae796938f57be90e65909487154fce22e70700b0bac03a653cf2152
7
+ data.tar.gz: e17f5f23cffeecd28f859a2a71bccfd1306dc2da94aa2a79b2eb34f29c5e121746156fe6fb63b6c2e275290c5b36081bf48834b65825eea71d327d4de29c0d95
data/Gemfile CHANGED
@@ -10,3 +10,7 @@ if this_file =~ %r{\A#{ENV['HOME']}}i and !ENV['USE_INSTALLED']
10
10
  gem 'activefacts-metamodel', path: dir+'/metamodel'
11
11
  gem 'activefacts-cql', path: dir+'/cql'
12
12
  end
13
+
14
+ group :development do
15
+ gem 'byebug'
16
+ end
@@ -1,5 +1,5 @@
1
1
  module ActiveFacts
2
2
  module Compositions
3
- VERSION = "1.9.19"
3
+ VERSION = "1.9.20"
4
4
  end
5
5
  end
@@ -24,9 +24,7 @@ module ActiveFacts
24
24
  options.
25
25
  merge(
26
26
  {
27
- dialect: [String, "SQL Dialect to use"],
28
- value_width: [Integer, "Number of characters to index from long values"],
29
- phonetic_confidence: [Integer, "Percentage confidence for a phonetic match"],
27
+ dialect: [String, "SQL Dialect to use"]
30
28
  }
31
29
  )
32
30
  end
@@ -49,9 +47,6 @@ module ActiveFacts
49
47
  end
50
48
 
51
49
  def process_options options
52
- @value_width = (options.delete('value_width') || 32).to_i
53
- @phonetic_confidence = (options.delete('phonetic_confidence') || 40).to_i
54
-
55
50
  super
56
51
  end
57
52
 
@@ -139,7 +134,7 @@ module ActiveFacts
139
134
 
140
135
  def generate_indicator leaf
141
136
  nil # REVISIT: Do we need anything here?
142
- # select leaf.root, safe_column_name(leaf), 1, column_name(leaf), 1
137
+ # select(leaf.root, safe_column_name(leaf), 1, column_name(leaf))
143
138
  end
144
139
 
145
140
  # This foreign key connects two composites (tables)
@@ -231,57 +226,46 @@ module ActiveFacts
231
226
  when MM::DataType::TYPE_Char,
232
227
  MM::DataType::TYPE_String,
233
228
  MM::DataType::TYPE_Text
234
- # Produce a truncated value with the requested search
229
+ # Produce a select yielding values for the requested search type
235
230
  search_methods.flat_map do |sm|
236
231
  case sm
237
232
  when 'none' # Do not index this value
238
233
  nil
239
234
 
240
235
  when 'simple' # Disregard white-space only
241
- select(composite, truncate(col_expr, @value_width), 'simple', source_field, 1.0)
236
+ select(composite, col_expr, 'simple', source_field)
242
237
 
243
- when 'alpha', # Strip white space and punctuation, just use alphabetic characters
244
- 'typo' # Use trigram similarity to detect typographic errors, over the same values
245
- truncated = truncate(as_alpha(col_expr), @value_width)
246
- select(
247
- composite, truncated, sm, source_field,
248
- "CASE WHEN #{truncated} = #{col_expr} THEN 1.0 ELSE 0.95 END" # Maybe exact match.
249
- )
238
+ when 'alpha' # Strip white space and punctuation, just use alphabetic characters
239
+ select(composite, as_alpha(col_expr), sm, source_field)
250
240
 
251
- when 'phonetic' # Use phonetic matching as well as trigrams
252
- search_expr(composite, intrinsic_type, col_expr, ['typo'], source_field) <<
253
- select(composite, phonetics(col_expr), 'phonetic', source_field, @phonetic_confidence/100.0, true)
241
+ when 'phonetic' # Use phonetic matching as well as trigrams and alpha
242
+ select(composite, as_alpha(col_expr), 'phonetic', source_field, phonetics(col_expr))
254
243
 
255
244
  when 'words' # Break the text into words and match each word like alpha
256
- truncated = truncate(unnest(as_words(col_expr)), @value_width)
257
- select(composite, truncated, sm, source_field, 0.90, true)
245
+ select(composite, unnest(as_words(col_expr)), sm, source_field)
258
246
 
259
247
  when 'names' # Break the text into words and match each word like phonetic
260
- truncated = truncate(unnest(as_words(col_expr, "''-")), @value_width) # N.B. ' is doubled for SQL
261
- search_expr(composite, intrinsic_type, col_expr, ['words'], source_field) <<
262
- phonetics(truncated).map do |phonetic|
263
- select(composite, phonetic, 'names', source_field, @phonetic_confidence/100.0, true)
264
- end
248
+ value = unnest(as_words(col_expr, "''-")) # N.B. ' is doubled for SQL
249
+ phonetic_select(value, select(composite, value, 'names', source_field))
265
250
 
266
251
  when 'text' # Index a large text field using significant words and phrases
267
252
  nil # REVISIT: Implement this type
268
253
 
269
254
  when 'number' # Cast to number and back to text to canonicalise the value;
270
- # If the number doesn't match this regexp, we don't index it.
271
- # This doesn't handle all valid Postgres numeric literals (e.g. 2.3e-4)
272
- select(composite, col_expr, 'number', source_field, number_or_null(col_expr))
255
+ # If it doesn't look like a number, we don't index it.
256
+ value = number_or_null(col_expr)
257
+ select(composite, value, 'number', source_field, nil, ["#{value} IS NOT NULL"])
273
258
 
274
259
  when 'phone' # Phone numbers; split, strip each to digits, take the last 8 of each
275
- select(composite, phone_numbers(col_expr), 'phone', source_field, 1)
260
+ select(composite, phone_numbers(col_expr), 'phone', source_field)
276
261
 
277
262
  when 'email' # Use a regexp to find email addresses in this field
278
- select(composite, truncate(email_addresses(col_expr), @value_width), 'email', source_field, 1)
263
+ select(composite, email_addresses(col_expr), 'email', source_field)
279
264
 
280
- when 'date' # Convert string to standard date format if it looks like a date, NULL otherwise
281
- select(
282
- composite, col_expr, 'date', source_field, 1,
283
- %Q{CASE WHEN #{col_expr} ~ '^ *[0-9]+[.]?[0-9]*|[.][0-9]+) *$' THEN (#{col_expr}::numeric):text ELSE NULL END}
284
- )
265
+ when 'date' # REVISIT: Convert string to standard date format
266
+ # If it doesn't look like a date, we don't index it.
267
+ value = date_or_null(col_expr)
268
+ select(composite, value, 'date', source_field, nil, ["#{value} IS NOT NULL"])
285
269
 
286
270
  else
287
271
  $stderrs.puts "Unknown search method #{sm}"
@@ -295,24 +279,22 @@ module ActiveFacts
295
279
  MM::DataType::TYPE_Real,
296
280
  MM::DataType::TYPE_Decimal,
297
281
  MM::DataType::TYPE_Money
298
- # Produce a right-justified value
299
- # REVISIT: This is a dumb thing to do.
300
- select(composite, lexical_decimal(col_expr, @value_width, value_type.scale), 'simple', source_field, 1)
282
+ select(composite, col_expr, 'simple', source_field)
301
283
 
302
284
  when MM::DataType::TYPE_Date
303
285
  # Produce an ISO representation that sorts lexically (YYYY-MM-DD)
304
286
  # REVISIT: Support search methods here
305
- select(composite, lexical_date(col_expr), 'simple', source_field, 1)
287
+ select(composite, lexical_date(col_expr), 'date', source_field)
306
288
 
307
289
  when MM::DataType::TYPE_DateTime,
308
290
  MM::DataType::TYPE_Timestamp
309
291
  # Produce an ISO representation that sorts lexically (YYYY-MM-DD HH:mm:ss)
310
292
  # REVISIT: Support search methods here
311
- select(composite, lexical_datetime(col_expr), 'simple', source_field, 1)
293
+ select(composite, lexical_datetime(col_expr), 'datetime', source_field)
312
294
 
313
295
  when MM::DataType::TYPE_Time
314
296
  # Produce an ISO representation that sorts lexically (YYYY-MM-DD HH:mm:ss)
315
- select(composite, lexical_time(col_expr), 'simple', source_field, 1)
297
+ select(composite, lexical_time(col_expr), 'time', source_field)
316
298
 
317
299
  when MM::DataType::TYPE_Binary
318
300
  nil # No indexing applied
@@ -325,35 +307,63 @@ module ActiveFacts
325
307
  name.words.send(@column_case)*@column_joiner
326
308
  end
327
309
 
328
- def select composite, expression, processing, source_field, confidence = 1, distinct = false, where = []
310
+ def field_names
311
+ @field_names ||=
312
+ %w{Value Phonetic Processing SourceTable SourceField LoadBatchID RecordGUID}.
313
+ map{|n| stylise_column_name(n)}
314
+ end
315
+
316
+ def phonetic_select expression, select
317
+ field_list =
318
+ field_names.
319
+ map do |n|
320
+ if n =~ /Phonetic/i
321
+ phonetics(Expression.new(stylise_column_name('Value'), MM::DataType::TYPE_String, true)).to_s + " AS #{n}"
322
+ else
323
+ n
324
+ end
325
+ end.
326
+ join(",\n\t")
327
+
328
+ %Q{
329
+ SELECT DISTINCT
330
+ <FIELDS>
331
+ FROM (<SUB>
332
+ ) AS s}.
333
+ unindent.
334
+ sub(/<FIELDS>/, field_list).
335
+ sub(/<SUB>/, select.gsub(/\n/,"\n\t"))
336
+ end
337
+
338
+ def select composite, expression, processing, source_field, phonetic = nil, conditions = []
329
339
  # These fields are in order of index precedence, to co-locate
330
340
  # comparable values regardless of source record type or column
331
- where << 'Value IS NOT NULL' if expression.to_s =~ /\bNULL\b/
332
- processing_name = stylise_column_name("Processing")
333
- value_name = stylise_column_name("Value")
334
- load_batch_id_name = stylise_column_name("LoadBatchID")
335
- record_guid_name = stylise_column_name("RecordGUID")
336
- confidence_name = stylise_column_name("Confidence")
337
- source_table_name = stylise_column_name("SourceTable")
338
- source_field_name = stylise_column_name("SourceField")
339
- expression_text = expression.to_s
341
+
342
+ select_list =
343
+ [ expression.to_s,
344
+ phonetic ? phonetic.to_s : 'NULL',
345
+ "'"+processing+"'::text",
346
+ "'"+safe_table_name(composite)+"'::text",
347
+ "'"+source_field+"'::text",
348
+ nil,
349
+ nil,
350
+ ].zip(field_names).
351
+ map(&:compact).
352
+ map{|a| a * ' AS '}.
353
+ join(%q{,
354
+ })
355
+ where =
356
+ if conditions.empty?
357
+ ''
358
+ else
359
+ "\nWHERE\t#{conditions*"\n AND\t"}"
360
+ end
340
361
  select = %Q{
341
- SELECT#{distinct ? ' DISTINCT' : ''}
342
- '#{processing}' AS #{processing_name},
343
- #{expression_text} AS #{value_name},
344
- #{load_batch_id_name},
345
- #{confidence} AS #{confidence_name},
346
- #{record_guid_name},
347
- '#{safe_table_name(composite)}' AS #{source_table_name},
348
- '#{source_field}' AS #{source_field_name}
362
+ SELECT DISTINCT
363
+ #{select_list}
349
364
  FROM #{safe_table_name(composite)}}.
350
- unindent
351
-
352
- if where.empty?
353
- select
354
- else
355
- "\nSELECT * FROM (#{select}\n) AS s WHERE #{where*' AND '}"
356
- end
365
+ unindent+
366
+ where
357
367
 
358
368
  end
359
369
 
@@ -166,27 +166,26 @@ module ActiveFacts
166
166
  expr
167
167
  end
168
168
 
169
- # Produce a lexically-sortable decimal representation of the given numeric expression, to the overall specified length and scale
170
- def lexical_decimal expr, length, scale = 0
171
- fraction_pattern = scale > 0 ? '.'+'0'*scale : ''
169
+ def number_or_null expr
170
+ # This doesn't handle all valid Postgres numeric literals (e.g. 2.3e-4)
172
171
  Expression.new(
173
- "to_char(#{expr}, 'MI#{'0'*(length-fraction_pattern.length-1)+fraction_pattern})",
174
- MM::DataType::TYPE_String,
175
- expr.is_mandatory
172
+ %Q{CASE WHEN #{expr} ~ '^ *[-+]?([0-9]+[.]?[0-9]*|[.][0-9]+) *$' THEN (#{expr}::numeric)::text ELSE NULL END},
173
+ MM::DataType::TYPE_Real,
174
+ false
176
175
  )
177
176
  end
178
177
 
179
- def number_or_null expr
178
+ def date_or_null expr
180
179
  Expression.new(
181
- %Q{CASE WHEN #{expr} ~ '^ *[-+]?([0-9]+[.]?[0-9]*|[.][0-9]+) *$' THEN #{expr}::numeric ELSE NULL END},
182
- MM::DataType::TYPE_Real,
180
+ %Q{CASE WHEN #{col_expr} ~ '^ *[0-9]+[-/]?[0-9]+[-/][0-9]+ *$' THEN (#{col_expr}::date):text ELSE NULL END},
181
+ MM::DataType::TYPE_Date,
183
182
  false
184
183
  )
185
184
  end
186
185
 
187
- def split_on_separators expr, seps = ',\\\\|'
186
+ def split_on_separators expr, seps = ',|'
188
187
  Expression.new(
189
- %Q{regexp_split_to_table(#{expr}, E'#{seps}')},
188
+ %Q{regexp_split_to_table(#{expr}, E'[#{seps}]')},
190
189
  MM::DataType::TYPE_String, true, true
191
190
  )
192
191
  end
@@ -194,7 +193,7 @@ module ActiveFacts
194
193
  # Extract separated numbers, remove non-digits, take the last 8 (removing area codes etc)
195
194
  def phone_numbers expr
196
195
  Expression.new(
197
- %Q{right(#{split_on_separators(%Q{regexp_replace(#{expr}, '[^0-9]+', '', 'g')})}, 8)},
196
+ %Q{right(regexp_replace(#{split_on_separators(expr)}, '[^0-9]+', '', 'g'), 8)},
198
197
  MM::DataType::TYPE_String,
199
198
  true
200
199
  )
@@ -237,26 +236,11 @@ module ActiveFacts
237
236
  end
238
237
 
239
238
  def phonetics expr
240
- if expr.is_array
241
- [
242
- Expression.new(
243
- %Q{dmetaphone(#{expr})},
244
- MM::DataType::TYPE_String,
245
- expr.is_mandatory
246
- ),
247
- Expression.new(
248
- %Q{dmetaphone_alt(#{expr})},
249
- MM::DataType::TYPE_String,
250
- expr.is_mandatory
251
- )
252
- ]
253
- else
254
- Expression.new(
255
- %Q{unnest(ARRAY[dmetaphone(#{expr}), dmetaphone_alt(#{expr})])},
256
- MM::DataType::TYPE_String,
257
- expr.is_mandatory
258
- )
259
- end
239
+ Expression.new(
240
+ %Q{unnest(ARRAY[dmetaphone(#{expr}), dmetaphone_alt(#{expr})])},
241
+ MM::DataType::TYPE_String,
242
+ expr.is_mandatory
243
+ )
260
244
  end
261
245
 
262
246
  # Reserved words cannot be used anywhere without quoting.
@@ -264,7 +248,7 @@ module ActiveFacts
264
248
  # Both lists here are added to the supertype's lists
265
249
  def reserved_words
266
250
  @postgres_reserved_words ||= %w{
267
- ANALYSE ANALYZE LIMIT PLACING RETURNING VARIADIC
251
+ ANALYSE ANALYZE LIMIT PLACING RETURNING SYMMETRIC VARIADIC
268
252
  }
269
253
  super + @postgres_reserved_words
270
254
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: activefacts-compositions
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.9.19
4
+ version: 1.9.20
5
5
  platform: ruby
6
6
  authors:
7
7
  - Clifford Heath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-02-12 00:00:00.000000000 Z
11
+ date: 2018-04-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler