loose_tight_dictionary 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/benchmark/memory.rb CHANGED
@@ -22,7 +22,7 @@ HAYSTACK_READER = lambda { |record| "#{record['Manufacturer']} #{record['Long Na
22
22
 
23
23
  # Whether to even bother trying to find a match for something without an explicit block
24
24
  # (Example) False, which is the default, which means we have more work to do
25
- STRICT_BLOCKING = false
25
+ MUST_MATCH_BLOCKING = false
26
26
 
27
27
  # Blockings
28
28
  # (Example) We made these by trial and error
@@ -38,7 +38,7 @@ IDENTITIES = RemoteTable.new(:url => "file://#{File.expand_path("../../examples/
38
38
 
39
39
  FINAL_OPTIONS = {
40
40
  :haystack_reader => HAYSTACK_READER,
41
- :strict_blocking => STRICT_BLOCKING,
41
+ :must_match_blocking => MUST_MATCH_BLOCKING,
42
42
  :tighteners => TIGHTENERS,
43
43
  :identities => IDENTITIES,
44
44
  :blockings => BLOCKINGS
@@ -20,7 +20,7 @@ HAYSTACK_READER = lambda { |record| "#{record['Manufacturer']} #{record['Long Na
20
20
 
21
21
  # Whether to even bother trying to find a match for something without an explicit block
22
22
  # (Example) False, which is the default, which means we have more work to do
23
- STRICT_BLOCKING = false
23
+ MUST_MATCH_BLOCKING = false
24
24
 
25
25
  # Blockings
26
26
  # (Example) We made these by trial and error
@@ -64,7 +64,7 @@ NEGATIVES = RemoteTable.new :url => "file://#{File.expand_path("../negatives.csv
64
64
 
65
65
  FINAL_OPTIONS = {
66
66
  :haystack_reader => HAYSTACK_READER,
67
- :strict_blocking => STRICT_BLOCKING,
67
+ :must_match_blocking => MUST_MATCH_BLOCKING,
68
68
  :tighteners => TIGHTENERS,
69
69
  :identities => IDENTITIES,
70
70
  :blockings => BLOCKINGS
@@ -7,10 +7,10 @@ require 'active_support/version'
7
7
  }.each do |active_support_3_requirement|
8
8
  require active_support_3_requirement
9
9
  end if ::ActiveSupport::VERSION::MAJOR == 3
10
+ require 'to_regexp'
10
11
 
11
12
  # See the README for more information.
12
13
  class LooseTightDictionary
13
- autoload :ExtractRegexp, 'loose_tight_dictionary/extract_regexp'
14
14
  autoload :Tightener, 'loose_tight_dictionary/tightener'
15
15
  autoload :Blocking, 'loose_tight_dictionary/blocking'
16
16
  autoload :Identity, 'loose_tight_dictionary/identity'
@@ -48,7 +48,6 @@ class LooseTightDictionary
48
48
  find needle, options
49
49
  end
50
50
 
51
- # todo fix record.record confusion (should be wrapper.record or smth)
52
51
  def find(needle, options = {})
53
52
  raise Freed if freed?
54
53
  free_last_result
@@ -69,7 +68,7 @@ class LooseTightDictionary
69
68
  last_result.needle = needle
70
69
  end
71
70
 
72
- if strict_blocking and blockings.none? { |blocking| blocking.encompass? needle }
71
+ if must_match_blocking and blockings.any? and blockings.none? { |blocking| blocking.match? needle }
73
72
  if find_all
74
73
  return []
75
74
  else
@@ -77,25 +76,31 @@ class LooseTightDictionary
77
76
  end
78
77
  end
79
78
 
80
- encompassed, unencompassed = if strict_blocking and blockings.any?
81
- haystack.partition do |record|
79
+ encompassed, unencompassed = if blockings.any?
80
+ haystack.partition do |straw|
82
81
  blockings.any? do |blocking|
83
- blocking.encompass?(needle, record) == true
82
+ blocking.encompass?(needle, straw) == true
84
83
  end
85
84
  end
86
85
  else
87
86
  [ haystack.dup, [] ]
88
87
  end
89
88
 
89
+ # special case: the needle didn't fit anywhere, but must_match_blocking is false, so we'll try it against everything
90
+ if encompassed.none?
91
+ encompassed = unencompassed
92
+ unencompassed = []
93
+ end
94
+
90
95
  if gather_last_result
91
96
  last_result.encompassed = encompassed
92
97
  last_result.unencompassed = unencompassed
93
98
  end
94
99
 
95
100
  possibly_identical, certainly_different = if identities.any?
96
- encompassed.partition do |record|
101
+ encompassed.partition do |straw|
97
102
  identities.all? do |identity|
98
- answer = identity.identical? needle, record
103
+ answer = identity.identical? needle, straw
99
104
  answer.nil? or answer == true
100
105
  end
101
106
  end
@@ -109,24 +114,24 @@ class LooseTightDictionary
109
114
  end
110
115
 
111
116
  if find_all
112
- return possibly_identical.map { |record| record.record }
117
+ return possibly_identical.map { |straw| straw.record }
113
118
  end
114
119
 
115
- similarities = possibly_identical.map do |record|
116
- needle.similarity record
120
+ similarities = possibly_identical.map do |straw|
121
+ needle.similarity straw
117
122
  end.sort
118
123
 
119
124
  best_similarity = similarities[-1]
120
- record = best_similarity.wrapper2
125
+ straw = best_similarity.wrapper2
121
126
  score = best_similarity.best_score.to_f
122
127
 
123
128
  if gather_last_result
124
129
  last_result.similarities = similarities
125
- last_result.record = record.record
130
+ last_result.record = straw.record
126
131
  last_result.score = score
127
132
  end
128
133
 
129
- record.record
134
+ straw.record
130
135
  end
131
136
 
132
137
  # Explain is like mysql's EXPLAIN command. You give it a needle and it tells you about how it was located (successfully or not) in the haystack.
@@ -188,8 +193,8 @@ class LooseTightDictionary
188
193
  options[:haystack_reader]
189
194
  end
190
195
 
191
- def strict_blocking
192
- options[:strict_blocking] || false
196
+ def must_match_blocking
197
+ options[:must_match_blocking] || false
193
198
  end
194
199
 
195
200
  def tighteners
@@ -7,22 +7,22 @@ class LooseTightDictionary
7
7
  # A blocking (as in a grouping) comes into effect when a str matches.
8
8
  # Then the needle must also match the blocking's regexp.
9
9
  class Blocking
10
- include ExtractRegexp
11
-
12
10
  attr_reader :regexp
13
11
 
14
12
  def initialize(regexp_or_str)
15
- @regexp = extract_regexp regexp_or_str
13
+ @regexp = regexp_or_str.is_a?(::Regexp) ? regexp_or_str : regexp_or_str.to_regexp
14
+ end
15
+
16
+ def match?(str)
17
+ !!(regexp.match(str))
16
18
  end
17
19
 
18
20
  # If a blocking "encompasses" two strings, that means they both fit into it.
19
21
  #
20
22
  # Returns false if they certainly don't fit this blocking.
21
23
  # Returns nil if the blocking doesn't apply, i.e. str2 doesn't fit the blocking.
22
- def encompass?(str1, str2 = nil)
23
- if str2.nil?
24
- !!(regexp.match(str1))
25
- elsif str2_match_data = regexp.match(str2)
24
+ def encompass?(str1, str2)
25
+ if str2_match_data = regexp.match(str2)
26
26
  if str1_match_data = regexp.match(str1)
27
27
  str2_match_data.captures == str1_match_data.captures
28
28
  else
@@ -2,12 +2,10 @@ class LooseTightDictionary
2
2
  # Identities take effect when needle and haystack both match a regexp
3
3
  # Then the captured part of the regexp has to match exactly
4
4
  class Identity
5
- include ExtractRegexp
6
-
7
5
  attr_reader :regexp
8
6
 
9
7
  def initialize(regexp_or_str)
10
- @regexp = extract_regexp regexp_or_str
8
+ @regexp = regexp_or_str.is_a?(::Regexp) ? regexp_or_str : regexp_or_str.to_regexp
11
9
  end
12
10
 
13
11
  # Two strings are "identical" if they both match this identity and the captures are equal.
@@ -1,12 +1,10 @@
1
1
  class LooseTightDictionary
2
2
  # A tightener just strips a string down to its core
3
3
  class Tightener
4
- include ExtractRegexp
5
-
6
4
  attr_reader :regexp
7
5
 
8
6
  def initialize(regexp_or_str)
9
- @regexp = extract_regexp regexp_or_str
7
+ @regexp = regexp_or_str.is_a?(::Regexp) ? regexp_or_str : regexp_or_str.to_regexp
10
8
  end
11
9
 
12
10
  # A tightener applies when its regexp matches and captures a new (shorter) string
@@ -1,3 +1,3 @@
1
1
  class LooseTightDictionary
2
- VERSION = '0.1.1'
2
+ VERSION = '0.2.0'
3
3
  end
@@ -23,4 +23,5 @@ Gem::Specification.new do |s|
23
23
  s.add_development_dependency "remote_table"
24
24
  s.add_dependency 'activesupport', '>=2.3.4'
25
25
  s.add_dependency 'amatch'
26
+ s.add_dependency 'to_regexp'
26
27
  end
@@ -1,9 +1,9 @@
1
1
  require 'helper'
2
2
 
3
3
  class TestBlocking < Test::Unit::TestCase
4
- def test_001_encompass_one
4
+ def test_001_match_one
5
5
  b = LooseTightDictionary::Blocking.new %r{apple}
6
- assert_equal true, b.encompass?('2 apples')
6
+ assert_equal true, b.match?('2 apples')
7
7
  end
8
8
 
9
9
  def test_002_encompass_both
@@ -15,4 +15,19 @@ class TestIdentity < Test::Unit::TestCase
15
15
  i = LooseTightDictionary::Identity.new %r{(A)[ ]*(\d)}
16
16
  assert_equal nil, i.identical?('B1', 'A 2foobar')
17
17
  end
18
+
19
+ def test_004_regexp
20
+ i = LooseTightDictionary::Identity.new %r{\A\\?/(.*)etc/mysql\$$}
21
+ assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
22
+ end
23
+
24
+ def test_005_regexp_from_string
25
+ i = LooseTightDictionary::Identity.new '%r{\A\\\?/(.*)etc/mysql\$$}'
26
+ assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
27
+ end
28
+
29
+ def test_006_regexp_from_string_using_slash_delim
30
+ i = LooseTightDictionary::Identity.new '/\A\\\?\/(.*)etc\/mysql\$$/'
31
+ assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
32
+ end
18
33
  end
@@ -51,24 +51,20 @@ class TestLooseTightDictionary < Test::Unit::TestCase
51
51
  assert_equal 'foo', d.find('baz')
52
52
  end
53
53
 
54
- def test_009_loose_blocking
55
- # sanity check
54
+ def test_009_must_match_blocking
56
55
  d = LooseTightDictionary.new [ 'X' ]
57
56
  assert_equal 'X', d.find('X')
58
57
  assert_equal 'X', d.find('A')
59
- # end sanity check
60
58
 
61
59
  d = LooseTightDictionary.new [ 'X' ], :blockings => [ /X/, /Y/ ]
62
60
  assert_equal 'X', d.find('X')
63
61
  assert_equal 'X', d.find('A')
64
- end
65
-
66
- def test_010_strict_blocking
67
- d = LooseTightDictionary.new [ 'X' ], :blockings => [ /X/, /Y/ ], :strict_blocking => true
62
+
63
+ d = LooseTightDictionary.new [ 'X' ], :blockings => [ /X/, /Y/ ], :must_match_blocking => true
68
64
  assert_equal 'X', d.find('X')
69
65
  assert_equal nil, d.find('A')
70
66
  end
71
-
67
+
72
68
  def test_011_free
73
69
  d = LooseTightDictionary.new %w{ NISSAN HONDA }
74
70
  d.free
@@ -78,7 +74,7 @@ class TestLooseTightDictionary < Test::Unit::TestCase
78
74
  end
79
75
 
80
76
  def test_012_find_all
81
- d = LooseTightDictionary.new [ 'X', 'X22', 'Y', 'Y4' ], :blockings => [ /X/, /Y/ ], :strict_blocking => true
77
+ d = LooseTightDictionary.new [ 'X', 'X22', 'Y', 'Y4' ], :blockings => [ /X/, /Y/ ], :must_match_blocking => true
82
78
  assert_equal ['X', 'X22' ], d.find_all('X')
83
79
  assert_equal [], d.find_all('A')
84
80
  end
metadata CHANGED
@@ -1,8 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: loose_tight_dictionary
3
3
  version: !ruby/object:Gem::Version
4
+ hash: 23
4
5
  prerelease:
5
- version: 0.1.1
6
+ segments:
7
+ - 0
8
+ - 2
9
+ - 0
10
+ version: 0.2.0
6
11
  platform: ruby
7
12
  authors:
8
13
  - Seamus Abshere
@@ -10,8 +15,7 @@ autorequire:
10
15
  bindir: bin
11
16
  cert_chain: []
12
17
 
13
- date: 2011-04-18 00:00:00 -05:00
14
- default_executable:
18
+ date: 2011-04-27 00:00:00 Z
15
19
  dependencies:
16
20
  - !ruby/object:Gem::Dependency
17
21
  name: shoulda
@@ -21,6 +25,9 @@ dependencies:
21
25
  requirements:
22
26
  - - ">="
23
27
  - !ruby/object:Gem::Version
28
+ hash: 3
29
+ segments:
30
+ - 0
24
31
  version: "0"
25
32
  type: :development
26
33
  version_requirements: *id001
@@ -32,6 +39,9 @@ dependencies:
32
39
  requirements:
33
40
  - - ">="
34
41
  - !ruby/object:Gem::Version
42
+ hash: 3
43
+ segments:
44
+ - 0
35
45
  version: "0"
36
46
  type: :development
37
47
  version_requirements: *id002
@@ -43,6 +53,11 @@ dependencies:
43
53
  requirements:
44
54
  - - ">="
45
55
  - !ruby/object:Gem::Version
56
+ hash: 11
57
+ segments:
58
+ - 2
59
+ - 3
60
+ - 4
46
61
  version: 2.3.4
47
62
  type: :runtime
48
63
  version_requirements: *id003
@@ -54,9 +69,26 @@ dependencies:
54
69
  requirements:
55
70
  - - ">="
56
71
  - !ruby/object:Gem::Version
72
+ hash: 3
73
+ segments:
74
+ - 0
57
75
  version: "0"
58
76
  type: :runtime
59
77
  version_requirements: *id004
78
+ - !ruby/object:Gem::Dependency
79
+ name: to_regexp
80
+ prerelease: false
81
+ requirement: &id005 !ruby/object:Gem::Requirement
82
+ none: false
83
+ requirements:
84
+ - - ">="
85
+ - !ruby/object:Gem::Version
86
+ hash: 3
87
+ segments:
88
+ - 0
89
+ version: "0"
90
+ type: :runtime
91
+ version_requirements: *id005
60
92
  description: Create dictionaries that link rows between two tables using loose matching (string similarity) by default and tight matching (regexp) by request.
61
93
  email:
62
94
  - seamus@abshere.net
@@ -93,7 +125,6 @@ files:
93
125
  - examples/icao-bts.xls
94
126
  - lib/loose_tight_dictionary.rb
95
127
  - lib/loose_tight_dictionary/blocking.rb
96
- - lib/loose_tight_dictionary/extract_regexp.rb
97
128
  - lib/loose_tight_dictionary/identity.rb
98
129
  - lib/loose_tight_dictionary/result.rb
99
130
  - lib/loose_tight_dictionary/score.rb
@@ -104,12 +135,10 @@ files:
104
135
  - loose_tight_dictionary.gemspec
105
136
  - test/helper.rb
106
137
  - test/test_blocking.rb
107
- - test/test_extract_regexp.rb
108
138
  - test/test_identity.rb
109
139
  - test/test_loose_tight_dictionary.rb
110
140
  - test/test_loose_tight_dictionary_convoluted.rb.disabled
111
141
  - test/test_tightening.rb
112
- has_rdoc: true
113
142
  homepage: https://github.com/seamusabshere/loose_tight_dictionary
114
143
  licenses: []
115
144
 
@@ -123,24 +152,29 @@ required_ruby_version: !ruby/object:Gem::Requirement
123
152
  requirements:
124
153
  - - ">="
125
154
  - !ruby/object:Gem::Version
155
+ hash: 3
156
+ segments:
157
+ - 0
126
158
  version: "0"
127
159
  required_rubygems_version: !ruby/object:Gem::Requirement
128
160
  none: false
129
161
  requirements:
130
162
  - - ">="
131
163
  - !ruby/object:Gem::Version
164
+ hash: 3
165
+ segments:
166
+ - 0
132
167
  version: "0"
133
168
  requirements: []
134
169
 
135
170
  rubyforge_project: loose_tight_dictionary
136
- rubygems_version: 1.6.2
171
+ rubygems_version: 1.7.2
137
172
  signing_key:
138
173
  specification_version: 3
139
174
  summary: Allows iterative development of dictionaries for big data sets.
140
175
  test_files:
141
176
  - test/helper.rb
142
177
  - test/test_blocking.rb
143
- - test/test_extract_regexp.rb
144
178
  - test/test_identity.rb
145
179
  - test/test_loose_tight_dictionary.rb
146
180
  - test/test_loose_tight_dictionary_convoluted.rb.disabled
@@ -1,30 +0,0 @@
1
- class LooseTightDictionary
2
- module ExtractRegexp #:nodoc: all
3
- def extract_regexp(regexp_or_str)
4
- case regexp_or_str
5
- when ::Regexp
6
- regexp_or_str
7
- when ::String
8
- regexp_from_string regexp_or_str
9
- else
10
- raise ::ArgumentError, "Expected regexp or string"
11
- end
12
- end
13
-
14
- REGEXP_DELIMITERS = {
15
- '%r{' => '}',
16
- '/' => '/'
17
- }
18
- def regexp_from_string(str)
19
- delim_start, delim_end = REGEXP_DELIMITERS.detect { |k, v| str.start_with? k }.map { |delim| ::Regexp.escape delim }
20
- %r{\A#{delim_start}(.*)#{delim_end}([^#{delim_end}]*)\z} =~ str.strip
21
- content = $1
22
- options = $2
23
- content.gsub! '\\/', '/'
24
- ignore_case = options.include?('i') ? ::Regexp::IGNORECASE : nil
25
- multiline = options.include?('m') ? ::Regexp::MULTILINE : nil
26
- extended = options.include?('x') ? ::Regexp::EXTENDED : nil
27
- ::Regexp.new content, (ignore_case||multiline||extended)
28
- end
29
- end
30
- end
@@ -1,18 +0,0 @@
1
- require 'helper'
2
-
3
- class TestExtractRegexp < Test::Unit::TestCase
4
- def test_001_regexp
5
- i = LooseTightDictionary::Identity.new %r{\A\\?/(.*)etc/mysql\$$}
6
- assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
7
- end
8
-
9
- def test_002_regexp_from_string
10
- i = LooseTightDictionary::Identity.new '%r{\A\\\?/(.*)etc/mysql\$$}'
11
- assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
12
- end
13
-
14
- def test_003_regexp_from_string_using_slash_delim
15
- i = LooseTightDictionary::Identity.new '/\A\\\?\/(.*)etc\/mysql\$$/'
16
- assert_equal %r{\A\\?/(.*)etc/mysql\$$}, i.regexp
17
- end
18
- end