loose_tight_dictionary 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/benchmark/memory.rb CHANGED
@@ -22,7 +22,7 @@ HAYSTACK_READER = lambda { |record| "#{record['Manufacturer']} #{record['Long Na
22
22
 
23
23
  # Whether to even bother trying to find a match for something without an explicit block
24
24
  # (Example) False, which is the default, which means we have more work to do
25
- STRICT_BLOCKING = false
25
+ MUST_MATCH_BLOCKING = false
26
26
 
27
27
  # Blockings
28
28
  # (Example) We made these by trial and error
@@ -38,7 +38,7 @@ IDENTITIES = RemoteTable.new(:url => "file://#{File.expand_path("../../examples/
38
38
 
39
39
  FINAL_OPTIONS = {
40
40
  :haystack_reader => HAYSTACK_READER,
41
- :strict_blocking => STRICT_BLOCKING,
41
+ :must_match_blocking => MUST_MATCH_BLOCKING,
42
42
  :tighteners => TIGHTENERS,
43
43
  :identities => IDENTITIES,
44
44
  :blockings => BLOCKINGS
@@ -20,7 +20,7 @@ HAYSTACK_READER = lambda { |record| "#{record['Manufacturer']} #{record['Long Na
20
20
 
21
21
  # Whether to even bother trying to find a match for something without an explicit block
22
22
  # (Example) False, which is the default, which means we have more work to do
23
- STRICT_BLOCKING = false
23
+ MUST_MATCH_BLOCKING = false
24
24
 
25
25
  # Blockings
26
26
  # (Example) We made these by trial and error
@@ -64,7 +64,7 @@ NEGATIVES = RemoteTable.new :url => "file://#{File.expand_path("../negatives.csv
64
64
 
65
65
  FINAL_OPTIONS = {
66
66
  :haystack_reader => HAYSTACK_READER,
67
- :strict_blocking => STRICT_BLOCKING,
67
+ :must_match_blocking => MUST_MATCH_BLOCKING,
68
68
  :tighteners => TIGHTENERS,
69
69
  :identities => IDENTITIES,
70
70
  :blockings => BLOCKINGS
@@ -7,10 +7,10 @@ require 'active_support/version'
7
7
  }.each do |active_support_3_requirement|
8
8
  require active_support_3_requirement
9
9
  end if ::ActiveSupport::VERSION::MAJOR == 3
10
+ require 'to_regexp'
10
11
 
11
12
  # See the README for more information.
12
13
  class LooseTightDictionary
13
- autoload :ExtractRegexp, 'loose_tight_dictionary/extract_regexp'
14
14
  autoload :Tightener, 'loose_tight_dictionary/tightener'
15
15
  autoload :Blocking, 'loose_tight_dictionary/blocking'
16
16
  autoload :Identity, 'loose_tight_dictionary/identity'
@@ -48,7 +48,6 @@ class LooseTightDictionary
48
48
  find needle, options
49
49
  end
50
50
 
51
- # todo fix record.record confusion (should be wrapper.record or smth)
52
51
  def find(needle, options = {})
53
52
  raise Freed if freed?
54
53
  free_last_result
@@ -69,7 +68,7 @@ class LooseTightDictionary
69
68
  last_result.needle = needle
70
69
  end
71
70
 
72
- if strict_blocking and blockings.none? { |blocking| blocking.encompass? needle }
71
+ if must_match_blocking and blockings.any? and blockings.none? { |blocking| blocking.match? needle }
73
72
  if find_all
74
73
  return []
75
74
  else
@@ -77,25 +76,31 @@ class LooseTightDictionary
77
76
  end
78
77
  end
79
78
 
80
- encompassed, unencompassed = if strict_blocking and blockings.any?
81
- haystack.partition do |record|
79
+ encompassed, unencompassed = if blockings.any?
80
+ haystack.partition do |straw|
82
81
  blockings.any? do |blocking|
83
- blocking.encompass?(needle, record) == true
82
+ blocking.encompass?(needle, straw) == true
84
83
  end
85
84
  end
86
85
  else
87
86
  [ haystack.dup, [] ]
88
87
  end
89
88
 
89
+ # special case: the needle didn't fit anywhere, but must_match_blocking is false, so we'll try it against everything
90
+ if encompassed.none?
91
+ encompassed = unencompassed
92
+ unencompassed = []
93
+ end
94
+
90
95
  if gather_last_result
91
96
  last_result.encompassed = encompassed
92
97
  last_result.unencompassed = unencompassed
93
98
  end
94
99
 
95
100
  possibly_identical, certainly_different = if identities.any?
96
- encompassed.partition do |record|
101
+ encompassed.partition do |straw|
97
102
  identities.all? do |identity|
98
- answer = identity.identical? needle, record
103
+ answer = identity.identical? needle, straw
99
104
  answer.nil? or answer == true
100
105
  end
101
106
  end
@@ -109,24 +114,24 @@ class LooseTightDictionary
109
114
  end
110
115
 
111
116
  if find_all
112
- return possibly_identical.map { |record| record.record }
117
+ return possibly_identical.map { |straw| straw.record }
113
118
  end
114
119
 
115
- similarities = possibly_identical.map do |record|
116
- needle.similarity record
120
+ similarities = possibly_identical.map do |straw|
121
+ needle.similarity straw
117
122
  end.sort
118
123
 
119
124
  best_similarity = similarities[-1]
120
- record = best_similarity.wrapper2
125
+ straw = best_similarity.wrapper2
121
126
  score = best_similarity.best_score.to_f
122
127
 
123
128
  if gather_last_result
124
129
  last_result.similarities = similarities
125
- last_result.record = record.record
130
+ last_result.record = straw.record
126
131
  last_result.score = score
127
132
  end
128
133
 
129
- record.record
134
+ straw.record
130
135
  end
131
136
 
132
137
  # Explain is like mysql's EXPLAIN command. You give it a needle and it tells you about how it was located (successfully or not) in the haystack.
@@ -188,8 +193,8 @@ class LooseTightDictionary
188
193
  options[:haystack_reader]
189
194
  end
190
195
 
191
- def strict_blocking
192
- options[:strict_blocking] || false
196
+ def must_match_blocking
197
+ options[:must_match_blocking] || false
193
198
  end
194
199
 
195
200
  def tighteners
@@ -7,22 +7,22 @@ class LooseTightDictionary
7
7
  # A blocking (as in a grouping) comes into effect when a str matches.
8
8
  # Then the needle must also match the blocking's regexp.
9
9
  class Blocking
10
- include ExtractRegexp
11
-
12
10
  attr_reader :regexp
13
11
 
14
12
  def initialize(regexp_or_str)
15
- @regexp = extract_regexp regexp_or_str
13
+ @regexp = regexp_or_str.is_a?(::Regexp) ? regexp_or_str : regexp_or_str.to_regexp
14
+ end
15
+
16
+ def match?(str)
17
+ !!(regexp.match(str))
16
18
  end
17
19
 
18
20
  # If a blocking "encompasses" two strings, that means they both fit into it.
19
21
  #
20
22
  # Returns false if they certainly don't fit this blocking.
21
23
  # Returns nil if the blocking doesn't apply, i.e. str2 doesn't fit the blocking.
22
- def encompass?(str1, str2 = nil)
23
- if str2.nil?
24
- !!(regexp.match(str1))
25
- elsif str2_match_data = regexp.match(str2)
24
+ def encompass?(str1, str2)
25
+ if str2_match_data = regexp.match(str2)
26
26
  if str1_match_data = regexp.match(str1)
27
27
  str2_match_data.captures == str1_match_data.captures
28
28
  else
@@ -2,12 +2,10 @@ class LooseTightDictionary
2
2
  # Identities take effect when needle and haystack both match a regexp
3
3
  # Then the captured part of the regexp has to match exactly
4
4
  class Identity
5
- include ExtractRegexp
6
-
7
5
  attr_reader :regexp
8
6
 
9
7
  def initialize(regexp_or_str)
10
- @regexp = extract_regexp regexp_or_str
8
+ @regexp = regexp_or_str.is_a?(::Regexp) ? regexp_or_str : regexp_or_str.to_regexp
11
9
  end
12
10
 
13
11
  # Two strings are "identical" if they both match this identity and the captures are equal.
@@ -1,12 +1,10 @@
1
1
  class LooseTightDictionary
2
2
  # A tightener just strips a string down to its core
3
3
  class Tightener
4
- include ExtractRegexp
5
-
6
4
  attr_reader :regexp
7
5
 
8
6
  def initialize(regexp_or_str)
9
- @regexp = extract_regexp regexp_or_str
7
+ @regexp = regexp_or_str.is_a?(::Regexp) ? regexp_or_str : regexp_or_str.to_regexp
10
8
  end
11
9
 
12
10
  # A tightener applies when its regexp matches and captures a new (shorter) string
@@ -1,3 +1,3 @@
1
1
  class LooseTightDictionary
2
- VERSION = '0.1.1'
2
+ VERSION = '0.2.0'
3
3
  end
@@ -23,4 +23,5 @@ Gem::Specification.new do |s|
23
23
  s.add_development_dependency "remote_table"
24
24
  s.add_dependency 'activesupport', '>=2.3.4'
25
25
  s.add_dependency 'amatch'
26
+ s.add_dependency 'to_regexp'
26
27
  end
@@ -1,9 +1,9 @@
1
1
  require 'helper'
2
2
 
3
3
  class TestBlocking < Test::Unit::TestCase
4
- def test_001_encompass_one
4
+ def test_001_match_one
5
5
  b = LooseTightDictionary::Blocking.new %r{apple}
6
- assert_equal true, b.encompass?('2 apples')
6
+ assert_equal true, b.match?('2 apples')
7
7
  end
8
8
 
9
9
  def test_002_encompass_both
@@ -15,4 +15,19 @@ class TestIdentity < Test::Unit::TestCase
15
15
  i = LooseTightDictionary::Identity.new %r{(A)[ ]*(\d)}
16
16
  assert_equal nil, i.identical?('B1', 'A 2foobar')
17
17
  end
18
+
19
+ def test_004_regexp
20
+ i = LooseTightDictionary::Identity.new %r{\A\\?/(.*)etc/mysql\$$}
21
+ assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
22
+ end
23
+
24
+ def test_005_regexp_from_string
25
+ i = LooseTightDictionary::Identity.new '%r{\A\\\?/(.*)etc/mysql\$$}'
26
+ assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
27
+ end
28
+
29
+ def test_006_regexp_from_string_using_slash_delim
30
+ i = LooseTightDictionary::Identity.new '/\A\\\?\/(.*)etc\/mysql\$$/'
31
+ assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
32
+ end
18
33
  end
@@ -51,24 +51,20 @@ class TestLooseTightDictionary < Test::Unit::TestCase
51
51
  assert_equal 'foo', d.find('baz')
52
52
  end
53
53
 
54
- def test_009_loose_blocking
55
- # sanity check
54
+ def test_009_must_match_blocking
56
55
  d = LooseTightDictionary.new [ 'X' ]
57
56
  assert_equal 'X', d.find('X')
58
57
  assert_equal 'X', d.find('A')
59
- # end sanity check
60
58
 
61
59
  d = LooseTightDictionary.new [ 'X' ], :blockings => [ /X/, /Y/ ]
62
60
  assert_equal 'X', d.find('X')
63
61
  assert_equal 'X', d.find('A')
64
- end
65
-
66
- def test_010_strict_blocking
67
- d = LooseTightDictionary.new [ 'X' ], :blockings => [ /X/, /Y/ ], :strict_blocking => true
62
+
63
+ d = LooseTightDictionary.new [ 'X' ], :blockings => [ /X/, /Y/ ], :must_match_blocking => true
68
64
  assert_equal 'X', d.find('X')
69
65
  assert_equal nil, d.find('A')
70
66
  end
71
-
67
+
72
68
  def test_011_free
73
69
  d = LooseTightDictionary.new %w{ NISSAN HONDA }
74
70
  d.free
@@ -78,7 +74,7 @@ class TestLooseTightDictionary < Test::Unit::TestCase
78
74
  end
79
75
 
80
76
  def test_012_find_all
81
- d = LooseTightDictionary.new [ 'X', 'X22', 'Y', 'Y4' ], :blockings => [ /X/, /Y/ ], :strict_blocking => true
77
+ d = LooseTightDictionary.new [ 'X', 'X22', 'Y', 'Y4' ], :blockings => [ /X/, /Y/ ], :must_match_blocking => true
82
78
  assert_equal ['X', 'X22' ], d.find_all('X')
83
79
  assert_equal [], d.find_all('A')
84
80
  end
metadata CHANGED
@@ -1,8 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: loose_tight_dictionary
3
3
  version: !ruby/object:Gem::Version
4
+ hash: 23
4
5
  prerelease:
5
- version: 0.1.1
6
+ segments:
7
+ - 0
8
+ - 2
9
+ - 0
10
+ version: 0.2.0
6
11
  platform: ruby
7
12
  authors:
8
13
  - Seamus Abshere
@@ -10,8 +15,7 @@ autorequire:
10
15
  bindir: bin
11
16
  cert_chain: []
12
17
 
13
- date: 2011-04-18 00:00:00 -05:00
14
- default_executable:
18
+ date: 2011-04-27 00:00:00 Z
15
19
  dependencies:
16
20
  - !ruby/object:Gem::Dependency
17
21
  name: shoulda
@@ -21,6 +25,9 @@ dependencies:
21
25
  requirements:
22
26
  - - ">="
23
27
  - !ruby/object:Gem::Version
28
+ hash: 3
29
+ segments:
30
+ - 0
24
31
  version: "0"
25
32
  type: :development
26
33
  version_requirements: *id001
@@ -32,6 +39,9 @@ dependencies:
32
39
  requirements:
33
40
  - - ">="
34
41
  - !ruby/object:Gem::Version
42
+ hash: 3
43
+ segments:
44
+ - 0
35
45
  version: "0"
36
46
  type: :development
37
47
  version_requirements: *id002
@@ -43,6 +53,11 @@ dependencies:
43
53
  requirements:
44
54
  - - ">="
45
55
  - !ruby/object:Gem::Version
56
+ hash: 11
57
+ segments:
58
+ - 2
59
+ - 3
60
+ - 4
46
61
  version: 2.3.4
47
62
  type: :runtime
48
63
  version_requirements: *id003
@@ -54,9 +69,26 @@ dependencies:
54
69
  requirements:
55
70
  - - ">="
56
71
  - !ruby/object:Gem::Version
72
+ hash: 3
73
+ segments:
74
+ - 0
57
75
  version: "0"
58
76
  type: :runtime
59
77
  version_requirements: *id004
78
+ - !ruby/object:Gem::Dependency
79
+ name: to_regexp
80
+ prerelease: false
81
+ requirement: &id005 !ruby/object:Gem::Requirement
82
+ none: false
83
+ requirements:
84
+ - - ">="
85
+ - !ruby/object:Gem::Version
86
+ hash: 3
87
+ segments:
88
+ - 0
89
+ version: "0"
90
+ type: :runtime
91
+ version_requirements: *id005
60
92
  description: Create dictionaries that link rows between two tables using loose matching (string similarity) by default and tight matching (regexp) by request.
61
93
  email:
62
94
  - seamus@abshere.net
@@ -93,7 +125,6 @@ files:
93
125
  - examples/icao-bts.xls
94
126
  - lib/loose_tight_dictionary.rb
95
127
  - lib/loose_tight_dictionary/blocking.rb
96
- - lib/loose_tight_dictionary/extract_regexp.rb
97
128
  - lib/loose_tight_dictionary/identity.rb
98
129
  - lib/loose_tight_dictionary/result.rb
99
130
  - lib/loose_tight_dictionary/score.rb
@@ -104,12 +135,10 @@ files:
104
135
  - loose_tight_dictionary.gemspec
105
136
  - test/helper.rb
106
137
  - test/test_blocking.rb
107
- - test/test_extract_regexp.rb
108
138
  - test/test_identity.rb
109
139
  - test/test_loose_tight_dictionary.rb
110
140
  - test/test_loose_tight_dictionary_convoluted.rb.disabled
111
141
  - test/test_tightening.rb
112
- has_rdoc: true
113
142
  homepage: https://github.com/seamusabshere/loose_tight_dictionary
114
143
  licenses: []
115
144
 
@@ -123,24 +152,29 @@ required_ruby_version: !ruby/object:Gem::Requirement
123
152
  requirements:
124
153
  - - ">="
125
154
  - !ruby/object:Gem::Version
155
+ hash: 3
156
+ segments:
157
+ - 0
126
158
  version: "0"
127
159
  required_rubygems_version: !ruby/object:Gem::Requirement
128
160
  none: false
129
161
  requirements:
130
162
  - - ">="
131
163
  - !ruby/object:Gem::Version
164
+ hash: 3
165
+ segments:
166
+ - 0
132
167
  version: "0"
133
168
  requirements: []
134
169
 
135
170
  rubyforge_project: loose_tight_dictionary
136
- rubygems_version: 1.6.2
171
+ rubygems_version: 1.7.2
137
172
  signing_key:
138
173
  specification_version: 3
139
174
  summary: Allows iterative development of dictionaries for big data sets.
140
175
  test_files:
141
176
  - test/helper.rb
142
177
  - test/test_blocking.rb
143
- - test/test_extract_regexp.rb
144
178
  - test/test_identity.rb
145
179
  - test/test_loose_tight_dictionary.rb
146
180
  - test/test_loose_tight_dictionary_convoluted.rb.disabled
@@ -1,30 +0,0 @@
1
- class LooseTightDictionary
2
- module ExtractRegexp #:nodoc: all
3
- def extract_regexp(regexp_or_str)
4
- case regexp_or_str
5
- when ::Regexp
6
- regexp_or_str
7
- when ::String
8
- regexp_from_string regexp_or_str
9
- else
10
- raise ::ArgumentError, "Expected regexp or string"
11
- end
12
- end
13
-
14
- REGEXP_DELIMITERS = {
15
- '%r{' => '}',
16
- '/' => '/'
17
- }
18
- def regexp_from_string(str)
19
- delim_start, delim_end = REGEXP_DELIMITERS.detect { |k, v| str.start_with? k }.map { |delim| ::Regexp.escape delim }
20
- %r{\A#{delim_start}(.*)#{delim_end}([^#{delim_end}]*)\z} =~ str.strip
21
- content = $1
22
- options = $2
23
- content.gsub! '\\/', '/'
24
- ignore_case = options.include?('i') ? ::Regexp::IGNORECASE : nil
25
- multiline = options.include?('m') ? ::Regexp::MULTILINE : nil
26
- extended = options.include?('x') ? ::Regexp::EXTENDED : nil
27
- ::Regexp.new content, (ignore_case||multiline||extended)
28
- end
29
- end
30
- end
@@ -1,18 +0,0 @@
1
- require 'helper'
2
-
3
- class TestExtractRegexp < Test::Unit::TestCase
4
- def test_001_regexp
5
- i = LooseTightDictionary::Identity.new %r{\A\\?/(.*)etc/mysql\$$}
6
- assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
7
- end
8
-
9
- def test_002_regexp_from_string
10
- i = LooseTightDictionary::Identity.new '%r{\A\\\?/(.*)etc/mysql\$$}'
11
- assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
12
- end
13
-
14
- def test_003_regexp_from_string_using_slash_delim
15
- i = LooseTightDictionary::Identity.new '/\A\\\?\/(.*)etc\/mysql\$$/'
16
- assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
17
- end
18
- end