activefacts-compositions 1.9.19 → 1.9.20
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +4 -0
- data/lib/activefacts/compositions/version.rb +1 -1
- data/lib/activefacts/generator/etl/unidex.rb +77 -67
- data/lib/activefacts/generator/traits/sql/postgres.rb +17 -33
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 074d060502e336f08e6e114ace5990b6d890f890
|
4
|
+
data.tar.gz: 4ddff75043dc0ce04ef5651fe69b105bee492827
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cfb954b0e3a018ca4bfc959644e2b8780e98789f500dbc9891629c299a8bc3e641c5830a9ae796938f57be90e65909487154fce22e70700b0bac03a653cf2152
|
7
|
+
data.tar.gz: e17f5f23cffeecd28f859a2a71bccfd1306dc2da94aa2a79b2eb34f29c5e121746156fe6fb63b6c2e275290c5b36081bf48834b65825eea71d327d4de29c0d95
|
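data/Gemfile
CHANGED
(diff body for this file is missing from this extract)
data/lib/activefacts/compositions/version.rb
CHANGED
(diff body for this file is missing from this extract)
data/lib/activefacts/generator/etl/unidex.rb
CHANGED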
@@ -24,9 +24,7 @@ module ActiveFacts
|
|
24
24
|
options.
|
25
25
|
merge(
|
26
26
|
{
|
27
|
-
dialect: [String, "SQL Dialect to use"]
|
28
|
-
value_width: [Integer, "Number of characters to index from long values"],
|
29
|
-
phonetic_confidence: [Integer, "Percentage confidence for a phonetic match"],
|
27
|
+
dialect: [String, "SQL Dialect to use"]
|
30
28
|
}
|
31
29
|
)
|
32
30
|
end
|
@@ -49,9 +47,6 @@ module ActiveFacts
|
|
49
47
|
end
|
50
48
|
|
51
49
|
def process_options options
|
52
|
-
@value_width = (options.delete('value_width') || 32).to_i
|
53
|
-
@phonetic_confidence = (options.delete('phonetic_confidence') || 40).to_i
|
54
|
-
|
55
50
|
super
|
56
51
|
end
|
57
52
|
|
@@ -139,7 +134,7 @@ module ActiveFacts
|
|
139
134
|
|
140
135
|
def generate_indicator leaf
|
141
136
|
nil # REVISIT: Do we need anything here?
|
142
|
-
# select
|
137
|
+
# select(leaf.root, safe_column_name(leaf), 1, column_name(leaf))
|
143
138
|
end
|
144
139
|
|
145
140
|
# This foreign key connects two composites (tables)
|
@@ -231,57 +226,46 @@ module ActiveFacts
|
|
231
226
|
when MM::DataType::TYPE_Char,
|
232
227
|
MM::DataType::TYPE_String,
|
233
228
|
MM::DataType::TYPE_Text
|
234
|
-
# Produce a
|
229
|
+
# Produce a select yielding values for the requested search type
|
235
230
|
search_methods.flat_map do |sm|
|
236
231
|
case sm
|
237
232
|
when 'none' # Do not index this value
|
238
233
|
nil
|
239
234
|
|
240
235
|
when 'simple' # Disregard white-space only
|
241
|
-
select(composite,
|
236
|
+
select(composite, col_expr, 'simple', source_field)
|
242
237
|
|
243
|
-
when 'alpha'
|
244
|
-
|
245
|
-
truncated = truncate(as_alpha(col_expr), @value_width)
|
246
|
-
select(
|
247
|
-
composite, truncated, sm, source_field,
|
248
|
-
"CASE WHEN #{truncated} = #{col_expr} THEN 1.0 ELSE 0.95 END" # Maybe exact match.
|
249
|
-
)
|
238
|
+
when 'alpha' # Strip white space and punctuation, just use alphabetic characters
|
239
|
+
select(composite, as_alpha(col_expr), sm, source_field)
|
250
240
|
|
251
|
-
when 'phonetic' # Use phonetic matching as well as trigrams
|
252
|
-
|
253
|
-
select(composite, phonetics(col_expr), 'phonetic', source_field, @phonetic_confidence/100.0, true)
|
241
|
+
when 'phonetic' # Use phonetic matching as well as trigrams and alpha
|
242
|
+
select(composite, as_alpha(col_expr), 'phonetic', source_field, phonetics(col_expr))
|
254
243
|
|
255
244
|
when 'words' # Break the text into words and match each word like alpha
|
256
|
-
|
257
|
-
select(composite, truncated, sm, source_field, 0.90, true)
|
245
|
+
select(composite, unnest(as_words(col_expr)), sm, source_field)
|
258
246
|
|
259
247
|
when 'names' # Break the text into words and match each word like phonetic
|
260
|
-
|
261
|
-
|
262
|
-
phonetics(truncated).map do |phonetic|
|
263
|
-
select(composite, phonetic, 'names', source_field, @phonetic_confidence/100.0, true)
|
264
|
-
end
|
248
|
+
value = unnest(as_words(col_expr, "''-")) # N.B. ' is doubled for SQL
|
249
|
+
phonetic_select(value, select(composite, value, 'names', source_field))
|
265
250
|
|
266
251
|
when 'text' # Index a large text field using significant words and phrases
|
267
252
|
nil # REVISIT: Implement this type
|
268
253
|
|
269
254
|
when 'number' # Cast to number and back to text to canonicalise the value;
|
270
|
-
# If
|
271
|
-
|
272
|
-
select(composite,
|
255
|
+
# If it doesn't look like a number, we don't index it.
|
256
|
+
value = number_or_null(col_expr)
|
257
|
+
select(composite, value, 'number', source_field, nil, ["#{value} IS NOT NULL"])
|
273
258
|
|
274
259
|
when 'phone' # Phone numbers; split, strip each to digits, take the last 8 of each
|
275
|
-
select(composite, phone_numbers(col_expr), 'phone', source_field
|
260
|
+
select(composite, phone_numbers(col_expr), 'phone', source_field)
|
276
261
|
|
277
262
|
when 'email' # Use a regexp to find email addresses in this field
|
278
|
-
select(composite,
|
263
|
+
select(composite, email_addresses(col_expr), 'email', source_field)
|
279
264
|
|
280
|
-
when 'date' # Convert string to standard date format
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
)
|
265
|
+
when 'date' # REVISIT: Convert string to standard date format
|
266
|
+
# If it doesn't look like a date, we don't index it.
|
267
|
+
value = date_or_null(col_expr)
|
268
|
+
select(composite, value, 'date', source_field, nil, ["#{value} IS NOT NULL"])
|
285
269
|
|
286
270
|
else
|
287
271
|
$stderrs.puts "Unknown search method #{sm}"
|
@@ -295,24 +279,22 @@ module ActiveFacts
|
|
295
279
|
MM::DataType::TYPE_Real,
|
296
280
|
MM::DataType::TYPE_Decimal,
|
297
281
|
MM::DataType::TYPE_Money
|
298
|
-
|
299
|
-
# REVISIT: This is a dumb thing to do.
|
300
|
-
select(composite, lexical_decimal(col_expr, @value_width, value_type.scale), 'simple', source_field, 1)
|
282
|
+
select(composite, col_expr, 'simple', source_field)
|
301
283
|
|
302
284
|
when MM::DataType::TYPE_Date
|
303
285
|
# Produce an ISO representation that sorts lexically (YYYY-MM-DD)
|
304
286
|
# REVISIT: Support search methods here
|
305
|
-
select(composite, lexical_date(col_expr), '
|
287
|
+
select(composite, lexical_date(col_expr), 'date', source_field)
|
306
288
|
|
307
289
|
when MM::DataType::TYPE_DateTime,
|
308
290
|
MM::DataType::TYPE_Timestamp
|
309
291
|
# Produce an ISO representation that sorts lexically (YYYY-MM-DD HH:mm:ss)
|
310
292
|
# REVISIT: Support search methods here
|
311
|
-
select(composite, lexical_datetime(col_expr), '
|
293
|
+
select(composite, lexical_datetime(col_expr), 'datetime', source_field)
|
312
294
|
|
313
295
|
when MM::DataType::TYPE_Time
|
314
296
|
# Produce an ISO representation that sorts lexically (YYYY-MM-DD HH:mm:ss)
|
315
|
-
select(composite, lexical_time(col_expr), '
|
297
|
+
select(composite, lexical_time(col_expr), 'time', source_field)
|
316
298
|
|
317
299
|
when MM::DataType::TYPE_Binary
|
318
300
|
nil # No indexing applied
|
@@ -325,35 +307,63 @@ module ActiveFacts
|
|
325
307
|
name.words.send(@column_case)*@column_joiner
|
326
308
|
end
|
327
309
|
|
328
|
-
def
|
310
|
+
def field_names
|
311
|
+
@field_names ||=
|
312
|
+
%w{Value Phonetic Processing SourceTable SourceField LoadBatchID RecordGUID}.
|
313
|
+
map{|n| stylise_column_name(n)}
|
314
|
+
end
|
315
|
+
|
316
|
+
def phonetic_select expression, select
|
317
|
+
field_list =
|
318
|
+
field_names.
|
319
|
+
map do |n|
|
320
|
+
if n =~ /Phonetic/i
|
321
|
+
phonetics(Expression.new(stylise_column_name('Value'), MM::DataType::TYPE_String, true)).to_s + " AS #{n}"
|
322
|
+
else
|
323
|
+
n
|
324
|
+
end
|
325
|
+
end.
|
326
|
+
join(",\n\t")
|
327
|
+
|
328
|
+
%Q{
|
329
|
+
SELECT DISTINCT
|
330
|
+
<FIELDS>
|
331
|
+
FROM (<SUB>
|
332
|
+
) AS s}.
|
333
|
+
unindent.
|
334
|
+
sub(/<FIELDS>/, field_list).
|
335
|
+
sub(/<SUB>/, select.gsub(/\n/,"\n\t"))
|
336
|
+
end
|
337
|
+
|
338
|
+
def select composite, expression, processing, source_field, phonetic = nil, conditions = []
|
329
339
|
# These fields are in order of index precedence, to co-locate
|
330
340
|
# comparable values regardless of source record type or column
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
341
|
+
|
342
|
+
select_list =
|
343
|
+
[ expression.to_s,
|
344
|
+
phonetic ? phonetic.to_s : 'NULL',
|
345
|
+
"'"+processing+"'::text",
|
346
|
+
"'"+safe_table_name(composite)+"'::text",
|
347
|
+
"'"+source_field+"'::text",
|
348
|
+
nil,
|
349
|
+
nil,
|
350
|
+
].zip(field_names).
|
351
|
+
map(&:compact).
|
352
|
+
map{|a| a * ' AS '}.
|
353
|
+
join(%q{,
|
354
|
+
})
|
355
|
+
where =
|
356
|
+
if conditions.empty?
|
357
|
+
''
|
358
|
+
else
|
359
|
+
"\nWHERE\t#{conditions*"\n AND\t"}"
|
360
|
+
end
|
340
361
|
select = %Q{
|
341
|
-
SELECT
|
342
|
-
|
343
|
-
#{expression_text} AS #{value_name},
|
344
|
-
#{load_batch_id_name},
|
345
|
-
#{confidence} AS #{confidence_name},
|
346
|
-
#{record_guid_name},
|
347
|
-
'#{safe_table_name(composite)}' AS #{source_table_name},
|
348
|
-
'#{source_field}' AS #{source_field_name}
|
362
|
+
SELECT DISTINCT
|
363
|
+
#{select_list}
|
349
364
|
FROM #{safe_table_name(composite)}}.
|
350
|
-
unindent
|
351
|
-
|
352
|
-
if where.empty?
|
353
|
-
select
|
354
|
-
else
|
355
|
-
"\nSELECT * FROM (#{select}\n) AS s WHERE #{where*' AND '}"
|
356
|
-
end
|
365
|
+
unindent+
|
366
|
+
where
|
357
367
|
|
358
368
|
end
|
359
369
|
|
data/lib/activefacts/generator/traits/sql/postgres.rb
CHANGED
@@ -166,27 +166,26 @@ module ActiveFacts
|
|
166
166
|
expr
|
167
167
|
end
|
168
168
|
|
169
|
-
|
170
|
-
|
171
|
-
fraction_pattern = scale > 0 ? '.'+'0'*scale : ''
|
169
|
+
def number_or_null expr
|
170
|
+
# This doesn't handle all valid Postgres numeric literals (e.g. 2.3e-4)
|
172
171
|
Expression.new(
|
173
|
-
|
174
|
-
MM::DataType::
|
175
|
-
|
172
|
+
%Q{CASE WHEN #{expr} ~ '^ *[-+]?([0-9]+[.]?[0-9]*|[.][0-9]+) *$' THEN (#{expr}::numeric)::text ELSE NULL END},
|
173
|
+
MM::DataType::TYPE_Real,
|
174
|
+
false
|
176
175
|
)
|
177
176
|
end
|
178
177
|
|
179
|
-
def
|
178
|
+
def date_or_null expr
|
180
179
|
Expression.new(
|
181
|
-
%Q{CASE WHEN #{
|
182
|
-
MM::DataType::
|
180
|
+
%Q{CASE WHEN #{col_expr} ~ '^ *[0-9]+[-/]?[0-9]+[-/][0-9]+ *$' THEN (#{col_expr}::date):text ELSE NULL END},
|
181
|
+
MM::DataType::TYPE_Date,
|
183
182
|
false
|
184
183
|
)
|
185
184
|
end
|
186
185
|
|
187
|
-
def split_on_separators expr, seps = '
|
186
|
+
def split_on_separators expr, seps = ',|'
|
188
187
|
Expression.new(
|
189
|
-
%Q{regexp_split_to_table(#{expr}, E'#{seps}')},
|
188
|
+
%Q{regexp_split_to_table(#{expr}, E'[#{seps}]')},
|
190
189
|
MM::DataType::TYPE_String, true, true
|
191
190
|
)
|
192
191
|
end
|
@@ -194,7 +193,7 @@ module ActiveFacts
|
|
194
193
|
# Extract separated numbers, remove non-digits, take the last 8 (removing area codes etc)
|
195
194
|
def phone_numbers expr
|
196
195
|
Expression.new(
|
197
|
-
%Q{right(#{split_on_separators(
|
196
|
+
%Q{right(regexp_replace(#{split_on_separators(expr)}, '[^0-9]+', '', 'g'), 8)},
|
198
197
|
MM::DataType::TYPE_String,
|
199
198
|
true
|
200
199
|
)
|
@@ -237,26 +236,11 @@ module ActiveFacts
|
|
237
236
|
end
|
238
237
|
|
239
238
|
def phonetics expr
|
240
|
-
|
241
|
-
[
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
expr.is_mandatory
|
246
|
-
),
|
247
|
-
Expression.new(
|
248
|
-
%Q{dmetaphone_alt(#{expr})},
|
249
|
-
MM::DataType::TYPE_String,
|
250
|
-
expr.is_mandatory
|
251
|
-
)
|
252
|
-
]
|
253
|
-
else
|
254
|
-
Expression.new(
|
255
|
-
%Q{unnest(ARRAY[dmetaphone(#{expr}), dmetaphone_alt(#{expr})])},
|
256
|
-
MM::DataType::TYPE_String,
|
257
|
-
expr.is_mandatory
|
258
|
-
)
|
259
|
-
end
|
239
|
+
Expression.new(
|
240
|
+
%Q{unnest(ARRAY[dmetaphone(#{expr}), dmetaphone_alt(#{expr})])},
|
241
|
+
MM::DataType::TYPE_String,
|
242
|
+
expr.is_mandatory
|
243
|
+
)
|
260
244
|
end
|
261
245
|
|
262
246
|
# Reserved words cannot be used anywhere without quoting.
|
@@ -264,7 +248,7 @@ module ActiveFacts
|
|
264
248
|
# Both lists here are added to the supertype's lists
|
265
249
|
def reserved_words
|
266
250
|
@postgres_reserved_words ||= %w{
|
267
|
-
ANALYSE ANALYZE LIMIT PLACING RETURNING VARIADIC
|
251
|
+
ANALYSE ANALYZE LIMIT PLACING RETURNING SYMMETRIC VARIADIC
|
268
252
|
}
|
269
253
|
super + @postgres_reserved_words
|
270
254
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: activefacts-compositions
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.9.19
|
4
|
+
version: 1.9.20
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Clifford Heath
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-04-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|