loose_tight_dictionary 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/benchmark/memory.rb +2 -2
- data/examples/bts_aircraft/test_bts_aircraft.rb +2 -2
- data/lib/loose_tight_dictionary.rb +21 -16
- data/lib/loose_tight_dictionary/blocking.rb +7 -7
- data/lib/loose_tight_dictionary/identity.rb +1 -3
- data/lib/loose_tight_dictionary/tightener.rb +1 -3
- data/lib/loose_tight_dictionary/version.rb +1 -1
- data/loose_tight_dictionary.gemspec +1 -0
- data/test/test_blocking.rb +2 -2
- data/test/test_identity.rb +15 -0
- data/test/test_loose_tight_dictionary.rb +5 -9
- metadata +42 -8
- data/lib/loose_tight_dictionary/extract_regexp.rb +0 -30
- data/test/test_extract_regexp.rb +0 -18
data/benchmark/memory.rb
CHANGED
@@ -22,7 +22,7 @@ HAYSTACK_READER = lambda { |record| "#{record['Manufacturer']} #{record['Long Na
|
|
22
22
|
|
23
23
|
# Whether to even bother trying to find a match for something without an explicit block
|
24
24
|
# (Example) False, which is the default, which means we have more work to do
|
25
|
-
|
25
|
+
MUST_MATCH_BLOCKING = false
|
26
26
|
|
27
27
|
# Blockings
|
28
28
|
# (Example) We made these by trial and error
|
@@ -38,7 +38,7 @@ IDENTITIES = RemoteTable.new(:url => "file://#{File.expand_path("../../examples/
|
|
38
38
|
|
39
39
|
FINAL_OPTIONS = {
|
40
40
|
:haystack_reader => HAYSTACK_READER,
|
41
|
-
:
|
41
|
+
:must_match_blocking => MUST_MATCH_BLOCKING,
|
42
42
|
:tighteners => TIGHTENERS,
|
43
43
|
:identities => IDENTITIES,
|
44
44
|
:blockings => BLOCKINGS
|
@@ -20,7 +20,7 @@ HAYSTACK_READER = lambda { |record| "#{record['Manufacturer']} #{record['Long Na
|
|
20
20
|
|
21
21
|
# Whether to even bother trying to find a match for something without an explicit block
|
22
22
|
# (Example) False, which is the default, which means we have more work to do
|
23
|
-
|
23
|
+
MUST_MATCH_BLOCKING = false
|
24
24
|
|
25
25
|
# Blockings
|
26
26
|
# (Example) We made these by trial and error
|
@@ -64,7 +64,7 @@ NEGATIVES = RemoteTable.new :url => "file://#{File.expand_path("../negatives.csv
|
|
64
64
|
|
65
65
|
FINAL_OPTIONS = {
|
66
66
|
:haystack_reader => HAYSTACK_READER,
|
67
|
-
:
|
67
|
+
:must_match_blocking => MUST_MATCH_BLOCKING,
|
68
68
|
:tighteners => TIGHTENERS,
|
69
69
|
:identities => IDENTITIES,
|
70
70
|
:blockings => BLOCKINGS
|
@@ -7,10 +7,10 @@ require 'active_support/version'
|
|
7
7
|
}.each do |active_support_3_requirement|
|
8
8
|
require active_support_3_requirement
|
9
9
|
end if ::ActiveSupport::VERSION::MAJOR == 3
|
10
|
+
require 'to_regexp'
|
10
11
|
|
11
12
|
# See the README for more information.
|
12
13
|
class LooseTightDictionary
|
13
|
-
autoload :ExtractRegexp, 'loose_tight_dictionary/extract_regexp'
|
14
14
|
autoload :Tightener, 'loose_tight_dictionary/tightener'
|
15
15
|
autoload :Blocking, 'loose_tight_dictionary/blocking'
|
16
16
|
autoload :Identity, 'loose_tight_dictionary/identity'
|
@@ -48,7 +48,6 @@ class LooseTightDictionary
|
|
48
48
|
find needle, options
|
49
49
|
end
|
50
50
|
|
51
|
-
# todo fix record.record confusion (should be wrapper.record or smth)
|
52
51
|
def find(needle, options = {})
|
53
52
|
raise Freed if freed?
|
54
53
|
free_last_result
|
@@ -69,7 +68,7 @@ class LooseTightDictionary
|
|
69
68
|
last_result.needle = needle
|
70
69
|
end
|
71
70
|
|
72
|
-
if
|
71
|
+
if must_match_blocking and blockings.any? and blockings.none? { |blocking| blocking.match? needle }
|
73
72
|
if find_all
|
74
73
|
return []
|
75
74
|
else
|
@@ -77,25 +76,31 @@ class LooseTightDictionary
|
|
77
76
|
end
|
78
77
|
end
|
79
78
|
|
80
|
-
encompassed, unencompassed = if
|
81
|
-
haystack.partition do |
|
79
|
+
encompassed, unencompassed = if blockings.any?
|
80
|
+
haystack.partition do |straw|
|
82
81
|
blockings.any? do |blocking|
|
83
|
-
blocking.encompass?(needle,
|
82
|
+
blocking.encompass?(needle, straw) == true
|
84
83
|
end
|
85
84
|
end
|
86
85
|
else
|
87
86
|
[ haystack.dup, [] ]
|
88
87
|
end
|
89
88
|
|
89
|
+
# special case: the needle didn't fit anywhere, but must_match_blocking is false, so we'll try it against everything
|
90
|
+
if encompassed.none?
|
91
|
+
encompassed = unencompassed
|
92
|
+
unencompassed = []
|
93
|
+
end
|
94
|
+
|
90
95
|
if gather_last_result
|
91
96
|
last_result.encompassed = encompassed
|
92
97
|
last_result.unencompassed = unencompassed
|
93
98
|
end
|
94
99
|
|
95
100
|
possibly_identical, certainly_different = if identities.any?
|
96
|
-
encompassed.partition do |
|
101
|
+
encompassed.partition do |straw|
|
97
102
|
identities.all? do |identity|
|
98
|
-
answer = identity.identical? needle,
|
103
|
+
answer = identity.identical? needle, straw
|
99
104
|
answer.nil? or answer == true
|
100
105
|
end
|
101
106
|
end
|
@@ -109,24 +114,24 @@ class LooseTightDictionary
|
|
109
114
|
end
|
110
115
|
|
111
116
|
if find_all
|
112
|
-
return possibly_identical.map { |
|
117
|
+
return possibly_identical.map { |straw| straw.record }
|
113
118
|
end
|
114
119
|
|
115
|
-
similarities = possibly_identical.map do |
|
116
|
-
needle.similarity
|
120
|
+
similarities = possibly_identical.map do |straw|
|
121
|
+
needle.similarity straw
|
117
122
|
end.sort
|
118
123
|
|
119
124
|
best_similarity = similarities[-1]
|
120
|
-
|
125
|
+
straw = best_similarity.wrapper2
|
121
126
|
score = best_similarity.best_score.to_f
|
122
127
|
|
123
128
|
if gather_last_result
|
124
129
|
last_result.similarities = similarities
|
125
|
-
last_result.record =
|
130
|
+
last_result.record = straw.record
|
126
131
|
last_result.score = score
|
127
132
|
end
|
128
133
|
|
129
|
-
|
134
|
+
straw.record
|
130
135
|
end
|
131
136
|
|
132
137
|
# Explain is like mysql's EXPLAIN command. You give it a needle and it tells you about how it was located (successfully or not) in the haystack.
|
@@ -188,8 +193,8 @@ class LooseTightDictionary
|
|
188
193
|
options[:haystack_reader]
|
189
194
|
end
|
190
195
|
|
191
|
-
def
|
192
|
-
options[:
|
196
|
+
def must_match_blocking
|
197
|
+
options[:must_match_blocking] || false
|
193
198
|
end
|
194
199
|
|
195
200
|
def tighteners
|
@@ -7,22 +7,22 @@ class LooseTightDictionary
|
|
7
7
|
# A blocking (as in a grouping) comes into effect when a str matches.
|
8
8
|
# Then the needle must also match the blocking's regexp.
|
9
9
|
class Blocking
|
10
|
-
include ExtractRegexp
|
11
|
-
|
12
10
|
attr_reader :regexp
|
13
11
|
|
14
12
|
def initialize(regexp_or_str)
|
15
|
-
@regexp =
|
13
|
+
@regexp = regexp_or_str.is_a?(::Regexp) ? regexp_or_str : regexp_or_str.to_regexp
|
14
|
+
end
|
15
|
+
|
16
|
+
def match?(str)
|
17
|
+
!!(regexp.match(str))
|
16
18
|
end
|
17
19
|
|
18
20
|
# If a blocking "encompasses" two strings, that means they both fit into it.
|
19
21
|
#
|
20
22
|
# Returns false if they certainly don't fit this blocking.
|
21
23
|
# Returns nil if the blocking doesn't apply, i.e. str2 doesn't fit the blocking.
|
22
|
-
def encompass?(str1, str2
|
23
|
-
if str2
|
24
|
-
!!(regexp.match(str1))
|
25
|
-
elsif str2_match_data = regexp.match(str2)
|
24
|
+
def encompass?(str1, str2)
|
25
|
+
if str2_match_data = regexp.match(str2)
|
26
26
|
if str1_match_data = regexp.match(str1)
|
27
27
|
str2_match_data.captures == str1_match_data.captures
|
28
28
|
else
|
@@ -2,12 +2,10 @@ class LooseTightDictionary
|
|
2
2
|
# Identities take effect when needle and haystack both match a regexp
|
3
3
|
# Then the captured part of the regexp has to match exactly
|
4
4
|
class Identity
|
5
|
-
include ExtractRegexp
|
6
|
-
|
7
5
|
attr_reader :regexp
|
8
6
|
|
9
7
|
def initialize(regexp_or_str)
|
10
|
-
@regexp =
|
8
|
+
@regexp = regexp_or_str.is_a?(::Regexp) ? regexp_or_str : regexp_or_str.to_regexp
|
11
9
|
end
|
12
10
|
|
13
11
|
# Two strings are "identical" if they both match this identity and the captures are equal.
|
@@ -1,12 +1,10 @@
|
|
1
1
|
class LooseTightDictionary
|
2
2
|
# A tightener just strips a string down to its core
|
3
3
|
class Tightener
|
4
|
-
include ExtractRegexp
|
5
|
-
|
6
4
|
attr_reader :regexp
|
7
5
|
|
8
6
|
def initialize(regexp_or_str)
|
9
|
-
@regexp =
|
7
|
+
@regexp = regexp_or_str.is_a?(::Regexp) ? regexp_or_str : regexp_or_str.to_regexp
|
10
8
|
end
|
11
9
|
|
12
10
|
# A tightener applies when its regexp matches and captures a new (shorter) string
|
data/test/test_blocking.rb
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
require 'helper'
|
2
2
|
|
3
3
|
class TestBlocking < Test::Unit::TestCase
|
4
|
-
def
|
4
|
+
def test_001_match_one
|
5
5
|
b = LooseTightDictionary::Blocking.new %r{apple}
|
6
|
-
assert_equal true, b.
|
6
|
+
assert_equal true, b.match?('2 apples')
|
7
7
|
end
|
8
8
|
|
9
9
|
def test_002_encompass_both
|
data/test/test_identity.rb
CHANGED
@@ -15,4 +15,19 @@ class TestIdentity < Test::Unit::TestCase
|
|
15
15
|
i = LooseTightDictionary::Identity.new %r{(A)[ ]*(\d)}
|
16
16
|
assert_equal nil, i.identical?('B1', 'A 2foobar')
|
17
17
|
end
|
18
|
+
|
19
|
+
def test_004_regexp
|
20
|
+
i = LooseTightDictionary::Identity.new %r{\A\\?/(.*)etc/mysql\$$}
|
21
|
+
assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
|
22
|
+
end
|
23
|
+
|
24
|
+
def test_005_regexp_from_string
|
25
|
+
i = LooseTightDictionary::Identity.new '%r{\A\\\?/(.*)etc/mysql\$$}'
|
26
|
+
assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
|
27
|
+
end
|
28
|
+
|
29
|
+
def test_006_regexp_from_string_using_slash_delim
|
30
|
+
i = LooseTightDictionary::Identity.new '/\A\\\?\/(.*)etc\/mysql\$$/'
|
31
|
+
assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
|
32
|
+
end
|
18
33
|
end
|
@@ -51,24 +51,20 @@ class TestLooseTightDictionary < Test::Unit::TestCase
|
|
51
51
|
assert_equal 'foo', d.find('baz')
|
52
52
|
end
|
53
53
|
|
54
|
-
def
|
55
|
-
# sanity check
|
54
|
+
def test_009_must_match_blocking
|
56
55
|
d = LooseTightDictionary.new [ 'X' ]
|
57
56
|
assert_equal 'X', d.find('X')
|
58
57
|
assert_equal 'X', d.find('A')
|
59
|
-
# end sanity check
|
60
58
|
|
61
59
|
d = LooseTightDictionary.new [ 'X' ], :blockings => [ /X/, /Y/ ]
|
62
60
|
assert_equal 'X', d.find('X')
|
63
61
|
assert_equal 'X', d.find('A')
|
64
|
-
|
65
|
-
|
66
|
-
def test_010_strict_blocking
|
67
|
-
d = LooseTightDictionary.new [ 'X' ], :blockings => [ /X/, /Y/ ], :strict_blocking => true
|
62
|
+
|
63
|
+
d = LooseTightDictionary.new [ 'X' ], :blockings => [ /X/, /Y/ ], :must_match_blocking => true
|
68
64
|
assert_equal 'X', d.find('X')
|
69
65
|
assert_equal nil, d.find('A')
|
70
66
|
end
|
71
|
-
|
67
|
+
|
72
68
|
def test_011_free
|
73
69
|
d = LooseTightDictionary.new %w{ NISSAN HONDA }
|
74
70
|
d.free
|
@@ -78,7 +74,7 @@ class TestLooseTightDictionary < Test::Unit::TestCase
|
|
78
74
|
end
|
79
75
|
|
80
76
|
def test_012_find_all
|
81
|
-
d = LooseTightDictionary.new [ 'X', 'X22', 'Y', 'Y4' ], :blockings => [ /X/, /Y/ ], :
|
77
|
+
d = LooseTightDictionary.new [ 'X', 'X22', 'Y', 'Y4' ], :blockings => [ /X/, /Y/ ], :must_match_blocking => true
|
82
78
|
assert_equal ['X', 'X22' ], d.find_all('X')
|
83
79
|
assert_equal [], d.find_all('A')
|
84
80
|
end
|
metadata
CHANGED
@@ -1,8 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: loose_tight_dictionary
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
+
hash: 23
|
4
5
|
prerelease:
|
5
|
-
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 2
|
9
|
+
- 0
|
10
|
+
version: 0.2.0
|
6
11
|
platform: ruby
|
7
12
|
authors:
|
8
13
|
- Seamus Abshere
|
@@ -10,8 +15,7 @@ autorequire:
|
|
10
15
|
bindir: bin
|
11
16
|
cert_chain: []
|
12
17
|
|
13
|
-
date: 2011-04-
|
14
|
-
default_executable:
|
18
|
+
date: 2011-04-27 00:00:00 Z
|
15
19
|
dependencies:
|
16
20
|
- !ruby/object:Gem::Dependency
|
17
21
|
name: shoulda
|
@@ -21,6 +25,9 @@ dependencies:
|
|
21
25
|
requirements:
|
22
26
|
- - ">="
|
23
27
|
- !ruby/object:Gem::Version
|
28
|
+
hash: 3
|
29
|
+
segments:
|
30
|
+
- 0
|
24
31
|
version: "0"
|
25
32
|
type: :development
|
26
33
|
version_requirements: *id001
|
@@ -32,6 +39,9 @@ dependencies:
|
|
32
39
|
requirements:
|
33
40
|
- - ">="
|
34
41
|
- !ruby/object:Gem::Version
|
42
|
+
hash: 3
|
43
|
+
segments:
|
44
|
+
- 0
|
35
45
|
version: "0"
|
36
46
|
type: :development
|
37
47
|
version_requirements: *id002
|
@@ -43,6 +53,11 @@ dependencies:
|
|
43
53
|
requirements:
|
44
54
|
- - ">="
|
45
55
|
- !ruby/object:Gem::Version
|
56
|
+
hash: 11
|
57
|
+
segments:
|
58
|
+
- 2
|
59
|
+
- 3
|
60
|
+
- 4
|
46
61
|
version: 2.3.4
|
47
62
|
type: :runtime
|
48
63
|
version_requirements: *id003
|
@@ -54,9 +69,26 @@ dependencies:
|
|
54
69
|
requirements:
|
55
70
|
- - ">="
|
56
71
|
- !ruby/object:Gem::Version
|
72
|
+
hash: 3
|
73
|
+
segments:
|
74
|
+
- 0
|
57
75
|
version: "0"
|
58
76
|
type: :runtime
|
59
77
|
version_requirements: *id004
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: to_regexp
|
80
|
+
prerelease: false
|
81
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
82
|
+
none: false
|
83
|
+
requirements:
|
84
|
+
- - ">="
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
hash: 3
|
87
|
+
segments:
|
88
|
+
- 0
|
89
|
+
version: "0"
|
90
|
+
type: :runtime
|
91
|
+
version_requirements: *id005
|
60
92
|
description: Create dictionaries that link rows between two tables using loose matching (string similarity) by default and tight matching (regexp) by request.
|
61
93
|
email:
|
62
94
|
- seamus@abshere.net
|
@@ -93,7 +125,6 @@ files:
|
|
93
125
|
- examples/icao-bts.xls
|
94
126
|
- lib/loose_tight_dictionary.rb
|
95
127
|
- lib/loose_tight_dictionary/blocking.rb
|
96
|
-
- lib/loose_tight_dictionary/extract_regexp.rb
|
97
128
|
- lib/loose_tight_dictionary/identity.rb
|
98
129
|
- lib/loose_tight_dictionary/result.rb
|
99
130
|
- lib/loose_tight_dictionary/score.rb
|
@@ -104,12 +135,10 @@ files:
|
|
104
135
|
- loose_tight_dictionary.gemspec
|
105
136
|
- test/helper.rb
|
106
137
|
- test/test_blocking.rb
|
107
|
-
- test/test_extract_regexp.rb
|
108
138
|
- test/test_identity.rb
|
109
139
|
- test/test_loose_tight_dictionary.rb
|
110
140
|
- test/test_loose_tight_dictionary_convoluted.rb.disabled
|
111
141
|
- test/test_tightening.rb
|
112
|
-
has_rdoc: true
|
113
142
|
homepage: https://github.com/seamusabshere/loose_tight_dictionary
|
114
143
|
licenses: []
|
115
144
|
|
@@ -123,24 +152,29 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
123
152
|
requirements:
|
124
153
|
- - ">="
|
125
154
|
- !ruby/object:Gem::Version
|
155
|
+
hash: 3
|
156
|
+
segments:
|
157
|
+
- 0
|
126
158
|
version: "0"
|
127
159
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
128
160
|
none: false
|
129
161
|
requirements:
|
130
162
|
- - ">="
|
131
163
|
- !ruby/object:Gem::Version
|
164
|
+
hash: 3
|
165
|
+
segments:
|
166
|
+
- 0
|
132
167
|
version: "0"
|
133
168
|
requirements: []
|
134
169
|
|
135
170
|
rubyforge_project: loose_tight_dictionary
|
136
|
-
rubygems_version: 1.
|
171
|
+
rubygems_version: 1.7.2
|
137
172
|
signing_key:
|
138
173
|
specification_version: 3
|
139
174
|
summary: Allows iterative development of dictionaries for big data sets.
|
140
175
|
test_files:
|
141
176
|
- test/helper.rb
|
142
177
|
- test/test_blocking.rb
|
143
|
-
- test/test_extract_regexp.rb
|
144
178
|
- test/test_identity.rb
|
145
179
|
- test/test_loose_tight_dictionary.rb
|
146
180
|
- test/test_loose_tight_dictionary_convoluted.rb.disabled
|
@@ -1,30 +0,0 @@
|
|
1
|
-
class LooseTightDictionary
|
2
|
-
module ExtractRegexp #:nodoc: all
|
3
|
-
def extract_regexp(regexp_or_str)
|
4
|
-
case regexp_or_str
|
5
|
-
when ::Regexp
|
6
|
-
regexp_or_str
|
7
|
-
when ::String
|
8
|
-
regexp_from_string regexp_or_str
|
9
|
-
else
|
10
|
-
raise ::ArgumentError, "Expected regexp or string"
|
11
|
-
end
|
12
|
-
end
|
13
|
-
|
14
|
-
REGEXP_DELIMITERS = {
|
15
|
-
'%r{' => '}',
|
16
|
-
'/' => '/'
|
17
|
-
}
|
18
|
-
def regexp_from_string(str)
|
19
|
-
delim_start, delim_end = REGEXP_DELIMITERS.detect { |k, v| str.start_with? k }.map { |delim| ::Regexp.escape delim }
|
20
|
-
%r{\A#{delim_start}(.*)#{delim_end}([^#{delim_end}]*)\z} =~ str.strip
|
21
|
-
content = $1
|
22
|
-
options = $2
|
23
|
-
content.gsub! '\\/', '/'
|
24
|
-
ignore_case = options.include?('i') ? ::Regexp::IGNORECASE : nil
|
25
|
-
multiline = options.include?('m') ? ::Regexp::MULTILINE : nil
|
26
|
-
extended = options.include?('x') ? ::Regexp::EXTENDED : nil
|
27
|
-
::Regexp.new content, (ignore_case||multiline||extended)
|
28
|
-
end
|
29
|
-
end
|
30
|
-
end
|
data/test/test_extract_regexp.rb
DELETED
@@ -1,18 +0,0 @@
|
|
1
|
-
require 'helper'
|
2
|
-
|
3
|
-
class TestExtractRegexp < Test::Unit::TestCase
|
4
|
-
def test_001_regexp
|
5
|
-
i = LooseTightDictionary::Identity.new %r{\A\\?/(.*)etc/mysql\$$}
|
6
|
-
assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
|
7
|
-
end
|
8
|
-
|
9
|
-
def test_002_regexp_from_string
|
10
|
-
i = LooseTightDictionary::Identity.new '%r{\A\\\?/(.*)etc/mysql\$$}'
|
11
|
-
assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
|
12
|
-
end
|
13
|
-
|
14
|
-
def test_003_regexp_from_string_using_slash_delim
|
15
|
-
i = LooseTightDictionary::Identity.new '/\A\\\?\/(.*)etc\/mysql\$$/'
|
16
|
-
assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
|
17
|
-
end
|
18
|
-
end
|