activefacts-compositions 1.9.19 → 1.9.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +4 -0
- data/lib/activefacts/compositions/version.rb +1 -1
- data/lib/activefacts/generator/etl/unidex.rb +77 -67
- data/lib/activefacts/generator/traits/sql/postgres.rb +17 -33
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 074d060502e336f08e6e114ace5990b6d890f890
|
4
|
+
data.tar.gz: 4ddff75043dc0ce04ef5651fe69b105bee492827
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cfb954b0e3a018ca4bfc959644e2b8780e98789f500dbc9891629c299a8bc3e641c5830a9ae796938f57be90e65909487154fce22e70700b0bac03a653cf2152
|
7
|
+
data.tar.gz: e17f5f23cffeecd28f859a2a71bccfd1306dc2da94aa2a79b2eb34f29c5e121746156fe6fb63b6c2e275290c5b36081bf48834b65825eea71d327d4de29c0d95
|
data/Gemfile
CHANGED
@@ -24,9 +24,7 @@ module ActiveFacts
|
|
24
24
|
options.
|
25
25
|
merge(
|
26
26
|
{
|
27
|
-
dialect: [String, "SQL Dialect to use"]
|
28
|
-
value_width: [Integer, "Number of characters to index from long values"],
|
29
|
-
phonetic_confidence: [Integer, "Percentage confidence for a phonetic match"],
|
27
|
+
dialect: [String, "SQL Dialect to use"]
|
30
28
|
}
|
31
29
|
)
|
32
30
|
end
|
@@ -49,9 +47,6 @@ module ActiveFacts
|
|
49
47
|
end
|
50
48
|
|
51
49
|
def process_options options
|
52
|
-
@value_width = (options.delete('value_width') || 32).to_i
|
53
|
-
@phonetic_confidence = (options.delete('phonetic_confidence') || 40).to_i
|
54
|
-
|
55
50
|
super
|
56
51
|
end
|
57
52
|
|
@@ -139,7 +134,7 @@ module ActiveFacts
|
|
139
134
|
|
140
135
|
def generate_indicator leaf
|
141
136
|
nil # REVISIT: Do we need anything here?
|
142
|
-
# select
|
137
|
+
# select(leaf.root, safe_column_name(leaf), 1, column_name(leaf))
|
143
138
|
end
|
144
139
|
|
145
140
|
# This foreign key connects two composites (tables)
|
@@ -231,57 +226,46 @@ module ActiveFacts
|
|
231
226
|
when MM::DataType::TYPE_Char,
|
232
227
|
MM::DataType::TYPE_String,
|
233
228
|
MM::DataType::TYPE_Text
|
234
|
-
# Produce a
|
229
|
+
# Produce a select yielding values for the requested search type
|
235
230
|
search_methods.flat_map do |sm|
|
236
231
|
case sm
|
237
232
|
when 'none' # Do not index this value
|
238
233
|
nil
|
239
234
|
|
240
235
|
when 'simple' # Disregard white-space only
|
241
|
-
select(composite,
|
236
|
+
select(composite, col_expr, 'simple', source_field)
|
242
237
|
|
243
|
-
when 'alpha'
|
244
|
-
|
245
|
-
truncated = truncate(as_alpha(col_expr), @value_width)
|
246
|
-
select(
|
247
|
-
composite, truncated, sm, source_field,
|
248
|
-
"CASE WHEN #{truncated} = #{col_expr} THEN 1.0 ELSE 0.95 END" # Maybe exact match.
|
249
|
-
)
|
238
|
+
when 'alpha' # Strip white space and punctuation, just use alphabetic characters
|
239
|
+
select(composite, as_alpha(col_expr), sm, source_field)
|
250
240
|
|
251
|
-
when 'phonetic' # Use phonetic matching as well as trigrams
|
252
|
-
|
253
|
-
select(composite, phonetics(col_expr), 'phonetic', source_field, @phonetic_confidence/100.0, true)
|
241
|
+
when 'phonetic' # Use phonetic matching as well as trigrams and alpha
|
242
|
+
select(composite, as_alpha(col_expr), 'phonetic', source_field, phonetics(col_expr))
|
254
243
|
|
255
244
|
when 'words' # Break the text into words and match each word like alpha
|
256
|
-
|
257
|
-
select(composite, truncated, sm, source_field, 0.90, true)
|
245
|
+
select(composite, unnest(as_words(col_expr)), sm, source_field)
|
258
246
|
|
259
247
|
when 'names' # Break the text into words and match each word like phonetic
|
260
|
-
|
261
|
-
|
262
|
-
phonetics(truncated).map do |phonetic|
|
263
|
-
select(composite, phonetic, 'names', source_field, @phonetic_confidence/100.0, true)
|
264
|
-
end
|
248
|
+
value = unnest(as_words(col_expr, "''-")) # N.B. ' is doubled for SQL
|
249
|
+
phonetic_select(value, select(composite, value, 'names', source_field))
|
265
250
|
|
266
251
|
when 'text' # Index a large text field using significant words and phrases
|
267
252
|
nil # REVISIT: Implement this type
|
268
253
|
|
269
254
|
when 'number' # Cast to number and back to text to canonicalise the value;
|
270
|
-
# If
|
271
|
-
|
272
|
-
select(composite,
|
255
|
+
# If it doesn't look like a number, we don't index it.
|
256
|
+
value = number_or_null(col_expr)
|
257
|
+
select(composite, value, 'number', source_field, nil, ["#{value} IS NOT NULL"])
|
273
258
|
|
274
259
|
when 'phone' # Phone numbers; split, strip each to digits, take the last 8 of each
|
275
|
-
select(composite, phone_numbers(col_expr), 'phone', source_field
|
260
|
+
select(composite, phone_numbers(col_expr), 'phone', source_field)
|
276
261
|
|
277
262
|
when 'email' # Use a regexp to find email addresses in this field
|
278
|
-
select(composite,
|
263
|
+
select(composite, email_addresses(col_expr), 'email', source_field)
|
279
264
|
|
280
|
-
when 'date' # Convert string to standard date format
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
)
|
265
|
+
when 'date' # REVISIT: Convert string to standard date format
|
266
|
+
# If it doesn't look like a date, we don't index it.
|
267
|
+
value = date_or_null(col_expr)
|
268
|
+
select(composite, value, 'date', source_field, nil, ["#{value} IS NOT NULL"])
|
285
269
|
|
286
270
|
else
|
287
271
|
$stderrs.puts "Unknown search method #{sm}"
|
@@ -295,24 +279,22 @@ module ActiveFacts
|
|
295
279
|
MM::DataType::TYPE_Real,
|
296
280
|
MM::DataType::TYPE_Decimal,
|
297
281
|
MM::DataType::TYPE_Money
|
298
|
-
|
299
|
-
# REVISIT: This is a dumb thing to do.
|
300
|
-
select(composite, lexical_decimal(col_expr, @value_width, value_type.scale), 'simple', source_field, 1)
|
282
|
+
select(composite, col_expr, 'simple', source_field)
|
301
283
|
|
302
284
|
when MM::DataType::TYPE_Date
|
303
285
|
# Produce an ISO representation that sorts lexically (YYYY-MM-DD)
|
304
286
|
# REVISIT: Support search methods here
|
305
|
-
select(composite, lexical_date(col_expr), '
|
287
|
+
select(composite, lexical_date(col_expr), 'date', source_field)
|
306
288
|
|
307
289
|
when MM::DataType::TYPE_DateTime,
|
308
290
|
MM::DataType::TYPE_Timestamp
|
309
291
|
# Produce an ISO representation that sorts lexically (YYYY-MM-DD HH:mm:ss)
|
310
292
|
# REVISIT: Support search methods here
|
311
|
-
select(composite, lexical_datetime(col_expr), '
|
293
|
+
select(composite, lexical_datetime(col_expr), 'datetime', source_field)
|
312
294
|
|
313
295
|
when MM::DataType::TYPE_Time
|
314
296
|
# Produce an ISO representation that sorts lexically (YYYY-MM-DD HH:mm:ss)
|
315
|
-
select(composite, lexical_time(col_expr), '
|
297
|
+
select(composite, lexical_time(col_expr), 'time', source_field)
|
316
298
|
|
317
299
|
when MM::DataType::TYPE_Binary
|
318
300
|
nil # No indexing applied
|
@@ -325,35 +307,63 @@ module ActiveFacts
|
|
325
307
|
name.words.send(@column_case)*@column_joiner
|
326
308
|
end
|
327
309
|
|
328
|
-
def
|
310
|
+
def field_names
|
311
|
+
@field_names ||=
|
312
|
+
%w{Value Phonetic Processing SourceTable SourceField LoadBatchID RecordGUID}.
|
313
|
+
map{|n| stylise_column_name(n)}
|
314
|
+
end
|
315
|
+
|
316
|
+
def phonetic_select expression, select
|
317
|
+
field_list =
|
318
|
+
field_names.
|
319
|
+
map do |n|
|
320
|
+
if n =~ /Phonetic/i
|
321
|
+
phonetics(Expression.new(stylise_column_name('Value'), MM::DataType::TYPE_String, true)).to_s + " AS #{n}"
|
322
|
+
else
|
323
|
+
n
|
324
|
+
end
|
325
|
+
end.
|
326
|
+
join(",\n\t")
|
327
|
+
|
328
|
+
%Q{
|
329
|
+
SELECT DISTINCT
|
330
|
+
<FIELDS>
|
331
|
+
FROM (<SUB>
|
332
|
+
) AS s}.
|
333
|
+
unindent.
|
334
|
+
sub(/<FIELDS>/, field_list).
|
335
|
+
sub(/<SUB>/, select.gsub(/\n/,"\n\t"))
|
336
|
+
end
|
337
|
+
|
338
|
+
def select composite, expression, processing, source_field, phonetic = nil, conditions = []
|
329
339
|
# These fields are in order of index precedence, to co-locate
|
330
340
|
# comparable values regardless of source record type or column
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
341
|
+
|
342
|
+
select_list =
|
343
|
+
[ expression.to_s,
|
344
|
+
phonetic ? phonetic.to_s : 'NULL',
|
345
|
+
"'"+processing+"'::text",
|
346
|
+
"'"+safe_table_name(composite)+"'::text",
|
347
|
+
"'"+source_field+"'::text",
|
348
|
+
nil,
|
349
|
+
nil,
|
350
|
+
].zip(field_names).
|
351
|
+
map(&:compact).
|
352
|
+
map{|a| a * ' AS '}.
|
353
|
+
join(%q{,
|
354
|
+
})
|
355
|
+
where =
|
356
|
+
if conditions.empty?
|
357
|
+
''
|
358
|
+
else
|
359
|
+
"\nWHERE\t#{conditions*"\n AND\t"}"
|
360
|
+
end
|
340
361
|
select = %Q{
|
341
|
-
SELECT
|
342
|
-
|
343
|
-
#{expression_text} AS #{value_name},
|
344
|
-
#{load_batch_id_name},
|
345
|
-
#{confidence} AS #{confidence_name},
|
346
|
-
#{record_guid_name},
|
347
|
-
'#{safe_table_name(composite)}' AS #{source_table_name},
|
348
|
-
'#{source_field}' AS #{source_field_name}
|
362
|
+
SELECT DISTINCT
|
363
|
+
#{select_list}
|
349
364
|
FROM #{safe_table_name(composite)}}.
|
350
|
-
unindent
|
351
|
-
|
352
|
-
if where.empty?
|
353
|
-
select
|
354
|
-
else
|
355
|
-
"\nSELECT * FROM (#{select}\n) AS s WHERE #{where*' AND '}"
|
356
|
-
end
|
365
|
+
unindent+
|
366
|
+
where
|
357
367
|
|
358
368
|
end
|
359
369
|
|
@@ -166,27 +166,26 @@ module ActiveFacts
|
|
166
166
|
expr
|
167
167
|
end
|
168
168
|
|
169
|
-
|
170
|
-
|
171
|
-
fraction_pattern = scale > 0 ? '.'+'0'*scale : ''
|
169
|
+
def number_or_null expr
|
170
|
+
# This doesn't handle all valid Postgres numeric literals (e.g. 2.3e-4)
|
172
171
|
Expression.new(
|
173
|
-
|
174
|
-
MM::DataType::
|
175
|
-
|
172
|
+
%Q{CASE WHEN #{expr} ~ '^ *[-+]?([0-9]+[.]?[0-9]*|[.][0-9]+) *$' THEN (#{expr}::numeric)::text ELSE NULL END},
|
173
|
+
MM::DataType::TYPE_Real,
|
174
|
+
false
|
176
175
|
)
|
177
176
|
end
|
178
177
|
|
179
|
-
def
|
178
|
+
def date_or_null expr
|
180
179
|
Expression.new(
|
181
|
-
%Q{CASE WHEN #{
|
182
|
-
MM::DataType::
|
180
|
+
%Q{CASE WHEN #{col_expr} ~ '^ *[0-9]+[-/]?[0-9]+[-/][0-9]+ *$' THEN (#{col_expr}::date):text ELSE NULL END},
|
181
|
+
MM::DataType::TYPE_Date,
|
183
182
|
false
|
184
183
|
)
|
185
184
|
end
|
186
185
|
|
187
|
-
def split_on_separators expr, seps = '
|
186
|
+
def split_on_separators expr, seps = ',|'
|
188
187
|
Expression.new(
|
189
|
-
%Q{regexp_split_to_table(#{expr}, E'#{seps}')},
|
188
|
+
%Q{regexp_split_to_table(#{expr}, E'[#{seps}]')},
|
190
189
|
MM::DataType::TYPE_String, true, true
|
191
190
|
)
|
192
191
|
end
|
@@ -194,7 +193,7 @@ module ActiveFacts
|
|
194
193
|
# Extract separated numbers, remove non-digits, take the last 8 (removing area codes etc)
|
195
194
|
def phone_numbers expr
|
196
195
|
Expression.new(
|
197
|
-
%Q{right(#{split_on_separators(
|
196
|
+
%Q{right(regexp_replace(#{split_on_separators(expr)}, '[^0-9]+', '', 'g'), 8)},
|
198
197
|
MM::DataType::TYPE_String,
|
199
198
|
true
|
200
199
|
)
|
@@ -237,26 +236,11 @@ module ActiveFacts
|
|
237
236
|
end
|
238
237
|
|
239
238
|
def phonetics expr
|
240
|
-
|
241
|
-
[
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
expr.is_mandatory
|
246
|
-
),
|
247
|
-
Expression.new(
|
248
|
-
%Q{dmetaphone_alt(#{expr})},
|
249
|
-
MM::DataType::TYPE_String,
|
250
|
-
expr.is_mandatory
|
251
|
-
)
|
252
|
-
]
|
253
|
-
else
|
254
|
-
Expression.new(
|
255
|
-
%Q{unnest(ARRAY[dmetaphone(#{expr}), dmetaphone_alt(#{expr})])},
|
256
|
-
MM::DataType::TYPE_String,
|
257
|
-
expr.is_mandatory
|
258
|
-
)
|
259
|
-
end
|
239
|
+
Expression.new(
|
240
|
+
%Q{unnest(ARRAY[dmetaphone(#{expr}), dmetaphone_alt(#{expr})])},
|
241
|
+
MM::DataType::TYPE_String,
|
242
|
+
expr.is_mandatory
|
243
|
+
)
|
260
244
|
end
|
261
245
|
|
262
246
|
# Reserved words cannot be used anywhere without quoting.
|
@@ -264,7 +248,7 @@ module ActiveFacts
|
|
264
248
|
# Both lists here are added to the supertype's lists
|
265
249
|
def reserved_words
|
266
250
|
@postgres_reserved_words ||= %w{
|
267
|
-
ANALYSE ANALYZE LIMIT PLACING RETURNING VARIADIC
|
251
|
+
ANALYSE ANALYZE LIMIT PLACING RETURNING SYMMETRIC VARIADIC
|
268
252
|
}
|
269
253
|
super + @postgres_reserved_words
|
270
254
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: activefacts-compositions
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.9.
|
4
|
+
version: 1.9.20
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Clifford Heath
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-04-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|