activefacts-compositions 1.9.19 → 1.9.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2dd043b17bcbdb5bf4f525fe24022ad95a32946e
4
- data.tar.gz: 95fb5b1b17e005cc0bd90f6f9c527e6455557a87
3
+ metadata.gz: 074d060502e336f08e6e114ace5990b6d890f890
4
+ data.tar.gz: 4ddff75043dc0ce04ef5651fe69b105bee492827
5
5
  SHA512:
6
- metadata.gz: 528cbe042208d8dd290ef5cd5e11ae390261f25c04c4e5558ef111f7fc18aab8a82ea1421ee2147c71d3659c077c93af632ba7372fc337b2797f2a6c69846b30
7
- data.tar.gz: fdf3d9610a0751f91d8b2b8fb022918aa81c31b99a73a359100622cef6f87d04338428253172921bde29e48906b25622915ea89d20fa416e0d156e8fd1695fff
6
+ metadata.gz: cfb954b0e3a018ca4bfc959644e2b8780e98789f500dbc9891629c299a8bc3e641c5830a9ae796938f57be90e65909487154fce22e70700b0bac03a653cf2152
7
+ data.tar.gz: e17f5f23cffeecd28f859a2a71bccfd1306dc2da94aa2a79b2eb34f29c5e121746156fe6fb63b6c2e275290c5b36081bf48834b65825eea71d327d4de29c0d95
data/Gemfile CHANGED
@@ -10,3 +10,7 @@ if this_file =~ %r{\A#{ENV['HOME']}}i and !ENV['USE_INSTALLED']
10
10
  gem 'activefacts-metamodel', path: dir+'/metamodel'
11
11
  gem 'activefacts-cql', path: dir+'/cql'
12
12
  end
13
+
14
+ group :development do
15
+ gem 'byebug'
16
+ end
@@ -1,5 +1,5 @@
1
1
  module ActiveFacts
2
2
  module Compositions
3
- VERSION = "1.9.19"
3
+ VERSION = "1.9.20"
4
4
  end
5
5
  end
@@ -24,9 +24,7 @@ module ActiveFacts
24
24
  options.
25
25
  merge(
26
26
  {
27
- dialect: [String, "SQL Dialect to use"],
28
- value_width: [Integer, "Number of characters to index from long values"],
29
- phonetic_confidence: [Integer, "Percentage confidence for a phonetic match"],
27
+ dialect: [String, "SQL Dialect to use"]
30
28
  }
31
29
  )
32
30
  end
@@ -49,9 +47,6 @@ module ActiveFacts
49
47
  end
50
48
 
51
49
  def process_options options
52
- @value_width = (options.delete('value_width') || 32).to_i
53
- @phonetic_confidence = (options.delete('phonetic_confidence') || 40).to_i
54
-
55
50
  super
56
51
  end
57
52
 
@@ -139,7 +134,7 @@ module ActiveFacts
139
134
 
140
135
  def generate_indicator leaf
141
136
  nil # REVISIT: Do we need anything here?
142
- # select leaf.root, safe_column_name(leaf), 1, column_name(leaf), 1
137
+ # select(leaf.root, safe_column_name(leaf), 1, column_name(leaf))
143
138
  end
144
139
 
145
140
  # This foreign key connects two composites (tables)
@@ -231,57 +226,46 @@ module ActiveFacts
231
226
  when MM::DataType::TYPE_Char,
232
227
  MM::DataType::TYPE_String,
233
228
  MM::DataType::TYPE_Text
234
- # Produce a truncated value with the requested search
229
+ # Produce a select yielding values for the requested search type
235
230
  search_methods.flat_map do |sm|
236
231
  case sm
237
232
  when 'none' # Do not index this value
238
233
  nil
239
234
 
240
235
  when 'simple' # Disregard white-space only
241
- select(composite, truncate(col_expr, @value_width), 'simple', source_field, 1.0)
236
+ select(composite, col_expr, 'simple', source_field)
242
237
 
243
- when 'alpha', # Strip white space and punctuation, just use alphabetic characters
244
- 'typo' # Use trigram similarity to detect typographic errors, over the same values
245
- truncated = truncate(as_alpha(col_expr), @value_width)
246
- select(
247
- composite, truncated, sm, source_field,
248
- "CASE WHEN #{truncated} = #{col_expr} THEN 1.0 ELSE 0.95 END" # Maybe exact match.
249
- )
238
+ when 'alpha' # Strip white space and punctuation, just use alphabetic characters
239
+ select(composite, as_alpha(col_expr), sm, source_field)
250
240
 
251
- when 'phonetic' # Use phonetic matching as well as trigrams
252
- search_expr(composite, intrinsic_type, col_expr, ['typo'], source_field) <<
253
- select(composite, phonetics(col_expr), 'phonetic', source_field, @phonetic_confidence/100.0, true)
241
+ when 'phonetic' # Use phonetic matching as well as trigrams and alpha
242
+ select(composite, as_alpha(col_expr), 'phonetic', source_field, phonetics(col_expr))
254
243
 
255
244
  when 'words' # Break the text into words and match each word like alpha
256
- truncated = truncate(unnest(as_words(col_expr)), @value_width)
257
- select(composite, truncated, sm, source_field, 0.90, true)
245
+ select(composite, unnest(as_words(col_expr)), sm, source_field)
258
246
 
259
247
  when 'names' # Break the text into words and match each word like phonetic
260
- truncated = truncate(unnest(as_words(col_expr, "''-")), @value_width) # N.B. ' is doubled for SQL
261
- search_expr(composite, intrinsic_type, col_expr, ['words'], source_field) <<
262
- phonetics(truncated).map do |phonetic|
263
- select(composite, phonetic, 'names', source_field, @phonetic_confidence/100.0, true)
264
- end
248
+ value = unnest(as_words(col_expr, "''-")) # N.B. ' is doubled for SQL
249
+ phonetic_select(value, select(composite, value, 'names', source_field))
265
250
 
266
251
  when 'text' # Index a large text field using significant words and phrases
267
252
  nil # REVISIT: Implement this type
268
253
 
269
254
  when 'number' # Cast to number and back to text to canonicalise the value;
270
- # If the number doesn't match this regexp, we don't index it.
271
- # This doesn't handle all valid Postgres numeric literals (e.g. 2.3e-4)
272
- select(composite, col_expr, 'number', source_field, number_or_null(col_expr))
255
+ # If it doesn't look like a number, we don't index it.
256
+ value = number_or_null(col_expr)
257
+ select(composite, value, 'number', source_field, nil, ["#{value} IS NOT NULL"])
273
258
 
274
259
  when 'phone' # Phone numbers; split, strip each to digits, take the last 8 of each
275
- select(composite, phone_numbers(col_expr), 'phone', source_field, 1)
260
+ select(composite, phone_numbers(col_expr), 'phone', source_field)
276
261
 
277
262
  when 'email' # Use a regexp to find email addresses in this field
278
- select(composite, truncate(email_addresses(col_expr), @value_width), 'email', source_field, 1)
263
+ select(composite, email_addresses(col_expr), 'email', source_field)
279
264
 
280
- when 'date' # Convert string to standard date format if it looks like a date, NULL otherwise
281
- select(
282
- composite, col_expr, 'date', source_field, 1,
283
- %Q{CASE WHEN #{col_expr} ~ '^ *[0-9]+[.]?[0-9]*|[.][0-9]+) *$' THEN (#{col_expr}::numeric):text ELSE NULL END}
284
- )
265
+ when 'date' # REVISIT: Convert string to standard date format
266
+ # If it doesn't look like a date, we don't index it.
267
+ value = date_or_null(col_expr)
268
+ select(composite, value, 'date', source_field, nil, ["#{value} IS NOT NULL"])
285
269
 
286
270
  else
287
271
  $stderrs.puts "Unknown search method #{sm}"
@@ -295,24 +279,22 @@ module ActiveFacts
295
279
  MM::DataType::TYPE_Real,
296
280
  MM::DataType::TYPE_Decimal,
297
281
  MM::DataType::TYPE_Money
298
- # Produce a right-justified value
299
- # REVISIT: This is a dumb thing to do.
300
- select(composite, lexical_decimal(col_expr, @value_width, value_type.scale), 'simple', source_field, 1)
282
+ select(composite, col_expr, 'simple', source_field)
301
283
 
302
284
  when MM::DataType::TYPE_Date
303
285
  # Produce an ISO representation that sorts lexically (YYYY-MM-DD)
304
286
  # REVISIT: Support search methods here
305
- select(composite, lexical_date(col_expr), 'simple', source_field, 1)
287
+ select(composite, lexical_date(col_expr), 'date', source_field)
306
288
 
307
289
  when MM::DataType::TYPE_DateTime,
308
290
  MM::DataType::TYPE_Timestamp
309
291
  # Produce an ISO representation that sorts lexically (YYYY-MM-DD HH:mm:ss)
310
292
  # REVISIT: Support search methods here
311
- select(composite, lexical_datetime(col_expr), 'simple', source_field, 1)
293
+ select(composite, lexical_datetime(col_expr), 'datetime', source_field)
312
294
 
313
295
  when MM::DataType::TYPE_Time
314
296
  # Produce an ISO representation that sorts lexically (YYYY-MM-DD HH:mm:ss)
315
- select(composite, lexical_time(col_expr), 'simple', source_field, 1)
297
+ select(composite, lexical_time(col_expr), 'time', source_field)
316
298
 
317
299
  when MM::DataType::TYPE_Binary
318
300
  nil # No indexing applied
@@ -325,35 +307,63 @@ module ActiveFacts
325
307
  name.words.send(@column_case)*@column_joiner
326
308
  end
327
309
 
328
- def select composite, expression, processing, source_field, confidence = 1, distinct = false, where = []
310
+ def field_names
311
+ @field_names ||=
312
+ %w{Value Phonetic Processing SourceTable SourceField LoadBatchID RecordGUID}.
313
+ map{|n| stylise_column_name(n)}
314
+ end
315
+
316
+ def phonetic_select expression, select
317
+ field_list =
318
+ field_names.
319
+ map do |n|
320
+ if n =~ /Phonetic/i
321
+ phonetics(Expression.new(stylise_column_name('Value'), MM::DataType::TYPE_String, true)).to_s + " AS #{n}"
322
+ else
323
+ n
324
+ end
325
+ end.
326
+ join(",\n\t")
327
+
328
+ %Q{
329
+ SELECT DISTINCT
330
+ <FIELDS>
331
+ FROM (<SUB>
332
+ ) AS s}.
333
+ unindent.
334
+ sub(/<FIELDS>/, field_list).
335
+ sub(/<SUB>/, select.gsub(/\n/,"\n\t"))
336
+ end
337
+
338
+ def select composite, expression, processing, source_field, phonetic = nil, conditions = []
329
339
  # These fields are in order of index precedence, to co-locate
330
340
  # comparable values regardless of source record type or column
331
- where << 'Value IS NOT NULL' if expression.to_s =~ /\bNULL\b/
332
- processing_name = stylise_column_name("Processing")
333
- value_name = stylise_column_name("Value")
334
- load_batch_id_name = stylise_column_name("LoadBatchID")
335
- record_guid_name = stylise_column_name("RecordGUID")
336
- confidence_name = stylise_column_name("Confidence")
337
- source_table_name = stylise_column_name("SourceTable")
338
- source_field_name = stylise_column_name("SourceField")
339
- expression_text = expression.to_s
341
+
342
+ select_list =
343
+ [ expression.to_s,
344
+ phonetic ? phonetic.to_s : 'NULL',
345
+ "'"+processing+"'::text",
346
+ "'"+safe_table_name(composite)+"'::text",
347
+ "'"+source_field+"'::text",
348
+ nil,
349
+ nil,
350
+ ].zip(field_names).
351
+ map(&:compact).
352
+ map{|a| a * ' AS '}.
353
+ join(%q{,
354
+ })
355
+ where =
356
+ if conditions.empty?
357
+ ''
358
+ else
359
+ "\nWHERE\t#{conditions*"\n AND\t"}"
360
+ end
340
361
  select = %Q{
341
- SELECT#{distinct ? ' DISTINCT' : ''}
342
- '#{processing}' AS #{processing_name},
343
- #{expression_text} AS #{value_name},
344
- #{load_batch_id_name},
345
- #{confidence} AS #{confidence_name},
346
- #{record_guid_name},
347
- '#{safe_table_name(composite)}' AS #{source_table_name},
348
- '#{source_field}' AS #{source_field_name}
362
+ SELECT DISTINCT
363
+ #{select_list}
349
364
  FROM #{safe_table_name(composite)}}.
350
- unindent
351
-
352
- if where.empty?
353
- select
354
- else
355
- "\nSELECT * FROM (#{select}\n) AS s WHERE #{where*' AND '}"
356
- end
365
+ unindent+
366
+ where
357
367
 
358
368
  end
359
369
 
@@ -166,27 +166,26 @@ module ActiveFacts
166
166
  expr
167
167
  end
168
168
 
169
- # Produce a lexically-sortable decimal representation of the given numeric expression, to the overall specified length and scale
170
- def lexical_decimal expr, length, scale = 0
171
- fraction_pattern = scale > 0 ? '.'+'0'*scale : ''
169
+ def number_or_null expr
170
+ # This doesn't handle all valid Postgres numeric literals (e.g. 2.3e-4)
172
171
  Expression.new(
173
- "to_char(#{expr}, 'MI#{'0'*(length-fraction_pattern.length-1)+fraction_pattern})",
174
- MM::DataType::TYPE_String,
175
- expr.is_mandatory
172
+ %Q{CASE WHEN #{expr} ~ '^ *[-+]?([0-9]+[.]?[0-9]*|[.][0-9]+) *$' THEN (#{expr}::numeric)::text ELSE NULL END},
173
+ MM::DataType::TYPE_Real,
174
+ false
176
175
  )
177
176
  end
178
177
 
179
- def number_or_null expr
178
+ def date_or_null expr
180
179
  Expression.new(
181
- %Q{CASE WHEN #{expr} ~ '^ *[-+]?([0-9]+[.]?[0-9]*|[.][0-9]+) *$' THEN #{expr}::numeric ELSE NULL END},
182
- MM::DataType::TYPE_Real,
180
+ %Q{CASE WHEN #{col_expr} ~ '^ *[0-9]+[-/]?[0-9]+[-/][0-9]+ *$' THEN (#{col_expr}::date):text ELSE NULL END},
181
+ MM::DataType::TYPE_Date,
183
182
  false
184
183
  )
185
184
  end
186
185
 
187
- def split_on_separators expr, seps = ',\\\\|'
186
+ def split_on_separators expr, seps = ',|'
188
187
  Expression.new(
189
- %Q{regexp_split_to_table(#{expr}, E'#{seps}')},
188
+ %Q{regexp_split_to_table(#{expr}, E'[#{seps}]')},
190
189
  MM::DataType::TYPE_String, true, true
191
190
  )
192
191
  end
@@ -194,7 +193,7 @@ module ActiveFacts
194
193
  # Extract separated numbers, remove non-digits, take the last 8 (removing area codes etc)
195
194
  def phone_numbers expr
196
195
  Expression.new(
197
- %Q{right(#{split_on_separators(%Q{regexp_replace(#{expr}, '[^0-9]+', '', 'g')})}, 8)},
196
+ %Q{right(regexp_replace(#{split_on_separators(expr)}, '[^0-9]+', '', 'g'), 8)},
198
197
  MM::DataType::TYPE_String,
199
198
  true
200
199
  )
@@ -237,26 +236,11 @@ module ActiveFacts
237
236
  end
238
237
 
239
238
  def phonetics expr
240
- if expr.is_array
241
- [
242
- Expression.new(
243
- %Q{dmetaphone(#{expr})},
244
- MM::DataType::TYPE_String,
245
- expr.is_mandatory
246
- ),
247
- Expression.new(
248
- %Q{dmetaphone_alt(#{expr})},
249
- MM::DataType::TYPE_String,
250
- expr.is_mandatory
251
- )
252
- ]
253
- else
254
- Expression.new(
255
- %Q{unnest(ARRAY[dmetaphone(#{expr}), dmetaphone_alt(#{expr})])},
256
- MM::DataType::TYPE_String,
257
- expr.is_mandatory
258
- )
259
- end
239
+ Expression.new(
240
+ %Q{unnest(ARRAY[dmetaphone(#{expr}), dmetaphone_alt(#{expr})])},
241
+ MM::DataType::TYPE_String,
242
+ expr.is_mandatory
243
+ )
260
244
  end
261
245
 
262
246
  # Reserved words cannot be used anywhere without quoting.
@@ -264,7 +248,7 @@ module ActiveFacts
264
248
  # Both lists here are added to the supertype's lists
265
249
  def reserved_words
266
250
  @postgres_reserved_words ||= %w{
267
- ANALYSE ANALYZE LIMIT PLACING RETURNING VARIADIC
251
+ ANALYSE ANALYZE LIMIT PLACING RETURNING SYMMETRIC VARIADIC
268
252
  }
269
253
  super + @postgres_reserved_words
270
254
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: activefacts-compositions
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.9.19
4
+ version: 1.9.20
5
5
  platform: ruby
6
6
  authors:
7
7
  - Clifford Heath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-02-12 00:00:00.000000000 Z
11
+ date: 2018-04-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler