fuzzy-matcher 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/fuzzy_matcher/fuzzy_date.rb +119 -0
- data/lib/fuzzy_matcher/fuzzy_date_scanner.rb +27 -0
- data/lib/fuzzy_matcher/fuzzy_price.rb +83 -0
- data/lib/fuzzy_matcher/fuzzy_scanner.rb +14 -0
- data/lib/fuzzy_matcher/fuzzy_sub.rb +33 -0
- data/lib/fuzzy_matcher/fuzzy_words.rb +37 -0
- data/lib/fuzzy_matcher/version.rb +5 -0
- data/lib/fuzzy_matcher.rb +6 -0
- data/spec/fuzzy_matcher/fuzzy_date_scanner_spec.rb +99 -0
- data/spec/fuzzy_matcher/fuzzy_date_spec.rb +77 -0
- data/spec/fuzzy_matcher/fuzzy_price_spec.rb +62 -0
- data/spec/fuzzy_matcher/fuzzy_scanner_spec.rb +44 -0
- data/spec/fuzzy_matcher/fuzzy_sub_spec.rb +25 -0
- data/spec/fuzzy_matcher/fuzzy_words_spec.rb +47 -0
- data/spec/minitest_helper.rb +4 -0
- metadata +70 -0
@@ -0,0 +1,119 @@
|
|
1
|
+
require 'date'
|
2
|
+
|
3
|
+
class FuzzyDate
|
4
|
+
|
5
|
+
FORMAT_2_REGEX = {
|
6
|
+
# 03/18/2014 or 3/18/2014
|
7
|
+
"%m/%d/%Y" => /(0?[1-9]|1[012])[-\/.~X](0?[1-9]|[12][0-9]|3[01])[-\/.~X](20[0-9][0-9]|[0-9][0-9]$)/,
|
8
|
+
|
9
|
+
# 18/03/2014
|
10
|
+
"%d/%m/%Y" => /(0[1-9]|[12][0-9]|3[01])[-\/.~X](0[1-9]|1[012])[-\/.~X](20[0-9][0-9]|[0-9][0-9]$)/,
|
11
|
+
|
12
|
+
# 2014-04-14
|
13
|
+
"%Y-%m-%d" => /(20[0-9][0-9])[-.~](0[1-9]|1[012])[-.~](0[1-9]|[12][0-9]|3[01])/,
|
14
|
+
|
15
|
+
# 10APR2014 or 4APR2014
|
16
|
+
"%d%b%Y" => /(0?[1-9]|[12][0-9]|3[01])(JAN|FEB|MAR|APR|ApR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)(19|20[0-9][0-9])/,
|
17
|
+
|
18
|
+
# April 7, 2014
|
19
|
+
"%B %d, %Y" => /(January|February|March|April|May|June|July|August|September|October|November|December) *(0?[1-9]|[12][0-9]|3[01])[,.]? *(19|20[0-9][0-9])/
|
20
|
+
}
|
21
|
+
|
22
|
+
attr_accessor :max_fuzz
|
23
|
+
|
24
|
+
def initialize(max_fuzz=2)
|
25
|
+
@fsub = FuzzySub.new FuzzySub::CHAR_2_NUM_SUB
|
26
|
+
@max_fuzz = max_fuzz
|
27
|
+
@scanners = []
|
28
|
+
FORMAT_2_REGEX.each do |key, value|
|
29
|
+
register key, value
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
def register(format, regex)
|
34
|
+
@scanners << FuzzyDateScanner.new(format, regex)
|
35
|
+
end
|
36
|
+
|
37
|
+
# allow fuzziness of 2 by default
|
38
|
+
def fscan(string, fuzziness=2)
|
39
|
+
@scanners.map do |fdscan|
|
40
|
+
matches = fdscan.fscan!(string, fuzziness)
|
41
|
+
[matches, fdscan.format] if !matches.empty?
|
42
|
+
end.compact
|
43
|
+
end
|
44
|
+
|
45
|
+
def validaterize(m, format)
|
46
|
+
str = m[0]
|
47
|
+
|
48
|
+
case format
|
49
|
+
when "%Y-%m-%d"
|
50
|
+
date = @fsub.fsub!(m[3])
|
51
|
+
mont = @fsub.fsub!(m[2])
|
52
|
+
year = @fsub.fsub!(m[1])
|
53
|
+
str = "#{year}-#{mont}-#{date}"
|
54
|
+
when "%m/%d/%Y"
|
55
|
+
date = @fsub.fsub!(m[2])
|
56
|
+
mont = @fsub.fsub!(m[1])
|
57
|
+
year = @fsub.fsub!(m[3])
|
58
|
+
format = "%m/%d/%y" if year.length < 4
|
59
|
+
str = "#{mont}/#{date}/#{year}"
|
60
|
+
when "%d/%m/%Y"
|
61
|
+
date = @fsub.fsub!(m[1])
|
62
|
+
mont = @fsub.fsub!(m[2])
|
63
|
+
year = @fsub.fsub!(m[3])
|
64
|
+
format = "%d/%m/%y" if year.length < 4
|
65
|
+
str = "#{date}/#{mont}/#{year}"
|
66
|
+
when "%d%b%Y"
|
67
|
+
date = @fsub.fsub!(m[1])
|
68
|
+
mont = m[2].upcase
|
69
|
+
year = @fsub.fsub!(m[3])
|
70
|
+
format = "%d%b%y" if year.length < 4
|
71
|
+
str = "#{date}#{mont}#{year}"
|
72
|
+
end
|
73
|
+
|
74
|
+
[str.strip, format]
|
75
|
+
end
|
76
|
+
|
77
|
+
def matches_to_dates(matches)
|
78
|
+
dates = []
|
79
|
+
matches.each do |m|
|
80
|
+
# p "#{self.class.to_s} match: #{m[0]} with format #{m[1]}"
|
81
|
+
strings = m[0]
|
82
|
+
format = m[1]
|
83
|
+
|
84
|
+
strings.each do |str|
|
85
|
+
k = validaterize str, format
|
86
|
+
# p "#{k[0]}, #{k[1]}"
|
87
|
+
begin
|
88
|
+
date = Date.strptime(k[0], k[1])
|
89
|
+
dates << date
|
90
|
+
rescue ArgumentError
|
91
|
+
# p "String #{k[0]} is not valide date for date format #{k[1]}"
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
dates
|
96
|
+
end
|
97
|
+
|
98
|
+
# iteratively find the dates, try fuzziness 1 and then 2
|
99
|
+
def to_date(string)
|
100
|
+
dates = []
|
101
|
+
fuzz = 1
|
102
|
+
while fuzz <= @max_fuzz do
|
103
|
+
matches = fscan string, fuzz
|
104
|
+
dates = matches_to_dates matches
|
105
|
+
break if !dates.empty?
|
106
|
+
fuzz = fuzz + 1
|
107
|
+
end
|
108
|
+
dates
|
109
|
+
end
|
110
|
+
|
111
|
+
end
|
112
|
+
|
113
|
+
# Mixin for String-like objects: lets the receiver be searched for dates
# directly via FuzzyDate.
module FuzzyDateString

  # max_fuzz - highest fuzziness level FuzzyDate may try (default 2).
  #
  # Returns whatever FuzzyDate#to_date returns for the receiver.
  def to_date(max_fuzz=2)
    finder = FuzzyDate.new(max_fuzz)
    finder.to_date(self)
  end

end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# {
|
2
|
+
# # 03/18/2014 or 3/18/2014
|
3
|
+
# "%m/%d/%Y" => /([0 ][1-9]|1[012])[-\/.~X](0[1-9]|[12][0-9]|3[01])[-\/.~X](20[0-9][0-9]|[0-9][0-9]$)/,
|
4
|
+
#
|
5
|
+
# # 18/03/2014
|
6
|
+
# "%d/%m/%Y" => /(0[1-9]|[12][0-9]|3[01])[-\/.~X](0[1-9]|1[012])[-\/.~X](20[0-9][0-9]|[0-9][0-9]$)/,
|
7
|
+
#
|
8
|
+
# # 2014-04-14
|
9
|
+
# "%Y-%m-%d" => /(20[0-9][0-9])[-.~](0[1-9]|1[012])[-.~](0[1-9]|[12][0-9]|3[01])/,
|
10
|
+
#
|
11
|
+
# # 10APR2014 or 4APR2014
|
12
|
+
# "%d%b%Y" => /([0 ][1-9]|[12][0-9]|3[01])(JAN|FEB|MAR|APR|ApR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)(19|20[0-9][0-9])/,
|
13
|
+
#
|
14
|
+
# # April 7, 2014
|
15
|
+
# "%B %d, %Y" => /(January|February|March|April|May|June|July|August|September|October|November|December) *(0?[1-9]|[12][0-9]|3[01])[,.]? *(19|20[0-9][0-9])/
|
16
|
+
# }
|
17
|
+
|
18
|
+
# A FuzzyScanner that also remembers which strptime format its pattern
# belongs to.
class FuzzyDateScanner < FuzzyScanner

  # strptime format string associated with the pattern
  attr_accessor :format

  # format - strptime format string
  # regex  - pattern stored in @regex for the inherited scanner
  def initialize(format, regex)
    @format, @regex = format, regex
  end

end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'tre-ruby'
|
4
|
+
|
5
|
+
class Regexp
  # Concatenates two patterns into a new Regexp whose source is this
  # pattern's source followed by r's source.
  #
  # r - the Regexp to append.
  #
  # The i/x/m flags of both operands are carried over (their union applies to
  # the whole result); without this a case-insensitive operand would silently
  # become case-sensitive. Encoding bits reported by #options are masked out
  # because Regexp.new derives the encoding from the source string itself.
  #
  # Returns a new Regexp.
  def +(r)
    flags = (options | r.options) & (IGNORECASE | EXTENDED | MULTILINE)
    Regexp.new(source + r.source, flags)
  end
end
|
10
|
+
|
11
|
+
# Extracts prices that follow a label (e.g. "Total") from noisy text using
# TRE approximate regex matching.
class FuzzyPrice

  # Optional "$", whole part, one separator character, two-digit fraction.
  # The hyphen is last inside the brackets so it is a literal "-"; written as
  # "_-—" it would be a range from "_" to the em dash, which covers every
  # lowercase letter and excludes the plain hyphen.
  FUZZY_PRICE_REGEX = /\$?([1-9]*[0-9])[._—-]([0-9][0-9])/

  # Label patterns that precede a total on a line.
  TOTAL_TEXT_REGEX = /(Total|TOTAL|Total Applied) +/

  # Highest fuzziness level #to_price is allowed to try.
  attr_accessor :max_fuzz

  # Overrides the price pattern; the reader below supplies the default.
  attr_writer :price_regex

  # max_fuzz - upper bound for the fuzziness levels tried by #to_price.
  def initialize(max_fuzz=2)
    @max_fuzz = max_fuzz
    @fsub = FuzzySub.new FuzzySub::CHAR_2_NUM_SUB
  end

  # Returns the pattern used to recognise a price; FUZZY_PRICE_REGEX unless a
  # custom one was assigned. Its first two capture groups are read as the
  # whole and fractional part.
  def price_regex
    @price_regex ||= FUZZY_PRICE_REGEX
  end

  # Looks, line by line, for text_regex immediately followed by a price.
  #
  # file         - text to search; it is split on "\n".
  # text_regex   - Regexp for the label preceding the price.
  # fuzzy_thresh - fuzziness level passed to TRE (default 2).
  #
  # Returns an Array of Float, one per price found.
  def find_price(file, text_regex, fuzzy_thresh=2)
    prices = []
    # Use the configurable pattern here as well, so that both matching passes
    # agree on what a price looks like.
    regex = text_regex + price_regex

    file.split("\n").each do |line|
      hits = line.extend(TRE).ascan regex, TRE.fuzziness(fuzzy_thresh)
      hits.each do |hit|
        # match the price regex separately again to improve accuracy
        price = (hit[0].extend(TRE).ascan price_regex, TRE.fuzziness(fuzzy_thresh))[0]
        next unless price

        whole = @fsub.fsub!(price[1])
        cents = @fsub.fsub!(price[2])
        next if whole.empty? || cents.empty?

        amount = "#{whole}.#{cents}"
        # remove $
        amount.gsub!("$", "")
        prices << amount.to_f
      end
    end

    prices
  end

  # Searches with fuzziness 1, 2, ... up to max_fuzz and stops at the first
  # level that yields at least one price.
  #
  # string - text to search.
  # regex  - Regexp for the label preceding the price.
  #
  # Returns an Array of Float, empty when nothing was found.
  def to_price(string, regex)
    prices = []
    (1..@max_fuzz).each do |fuzz|
      prices = find_price string, regex, fuzz
      break unless prices.empty?
    end
    prices
  end

end
|
76
|
+
|
77
|
+
# Mixin for String-like objects: lets the receiver be searched for prices
# directly via FuzzyPrice.
module FuzzyPriceString

  # regex    - Regexp for the label that precedes the price.
  # max_fuzz - highest fuzziness level FuzzyPrice may try (default 2).
  #
  # Returns whatever FuzzyPrice#to_price returns for the receiver.
  def to_price(regex, max_fuzz=2)
    finder = FuzzyPrice.new(max_fuzz)
    finder.to_price(self, regex)
  end

end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'tre-ruby'
|
2
|
+
|
3
|
+
# Approximate scanner: finds every occurrence of #regex in a string using
# TRE.
class FuzzyScanner

  # pattern to look for
  attr_accessor :regex

  # Scans str for approximate matches of #regex.
  #
  # str       - text to scan. It is modified in place: newlines are replaced
  #             by spaces and the string is extended with TRE.
  # fuzziness - fuzziness level passed to TRE (2 by default).
  #
  # Returns the matches with duplicates removed.
  def fscan!(str, fuzziness=2)
    str.gsub!(/\n/, " ")
    scannable = str.extend(TRE)
    found = scannable.ascan(regex, TRE.fuzziness(fuzziness))
    found.uniq
  end

end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# Replaces look-alike characters in a string according to a substitution
# table.
class FuzzySub

  # Characters commonly confused with digits => the digit to use instead.
  # Each key is a set of characters; every one of them maps to the value.
  CHAR_2_NUM_SUB = {
    "A" => "4",
    "OoD" => "0",
    "liI," => "1",
    "q" => "4" # could be 9
  }

  # Hash of "characters to replace" => "replacement".
  attr_accessor :sub_hash

  # sub_hash - Hash whose keys are sets of characters and whose values are
  #            the text each of those characters is replaced with.
  def initialize(sub_hash)
    @sub_hash = sub_hash
  end

  # Applies every substitution of sub_hash to word, in hash order.
  #
  # word - String to fix; it is modified in place.
  #
  # Every character of a key is taken literally. The matcher is built with
  # Regexp.union instead of interpolating the key into "[...]", where
  # characters such as "^", "-" or "]" would change the meaning of the class
  # and an empty key would raise.
  #
  # Returns word (the same object).
  def fsub!(word)
    sub_hash.each do |chars, replacement|
      word.gsub!(Regexp.union(chars.chars), replacement)
    end
    word
  end

end
|
24
|
+
|
25
|
+
# Mixin for String-like objects: applies a FuzzySub substitution table to the
# receiver.
module FuzzySubString

  # sub_hash - substitution table handed to FuzzySub.new.
  #
  # Returns whatever FuzzySub#fsub! returns for the receiver.
  def fsub!(sub_hash)
    FuzzySub.new(sub_hash).fsub!(self)
  end

end
|
33
|
+
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Looks up known words or phrases in noisy text using TRE approximate
# matching.
class FuzzyWords

  # Highest fuzziness level #find_word is allowed to try.
  attr_accessor :max_fuzz

  # max_fuzz - upper bound for the fuzziness levels tried per word
  #            (default 4).
  def initialize(max_fuzz=4)
    @max_fuzz = max_fuzz
  end

  # Searches the text for each of the given words.
  #
  # file  - text to search.
  # words - Array of words/phrases to look for.
  #
  # Returns an Array of { word:, matches: } Hashes, one for every word that
  # produced at least one match.
  def find_words(file, words)
    words.map do |w|
      matches = find_word file, w
      { word: w, matches: matches } unless matches.empty?
    end.compact
  end

  # Searches the text for one word with fuzziness 1, 2, ... up to max_fuzz
  # and stops at the first level that matches.
  #
  # text - text to search. A copy is extended with TRE, so the caller's
  #        string is never modified and frozen strings are accepted.
  # word - the word/phrase to look for.
  #
  # Returns the matches reported by TRE, or an empty Array.
  def find_word(text, word)
    matches = []
    haystack = nil
    (1..@max_fuzz).each do |fuzz|
      # extend once, and only when a search is actually going to run
      haystack ||= text.dup.extend(TRE)
      matches = haystack.ascan word, TRE.fuzziness(fuzz)
      break unless matches.empty?
    end
    matches
  end

end
|
28
|
+
|
29
|
+
# Mixin for String-like objects: lets the receiver be searched for known
# words directly via FuzzyWords.
module FuzzyWordsString

  # words    - Array of words/phrases to look for.
  # max_fuzz - highest fuzziness level FuzzyWords may try (default 4).
  #
  # Returns whatever FuzzyWords#find_words returns for the receiver.
  def fuzzy_match_words(words, max_fuzz=4)
    matcher = FuzzyWords.new(max_fuzz)
    matcher.find_words(self, words)
  end

end
|
36
|
+
|
37
|
+
|
@@ -0,0 +1,99 @@
|
|
1
|
+
require 'minitest_helper'
|
2
|
+
|
3
|
+
# Specs for FuzzyDateScanner#fscan! with a month-first and a day-first
# pattern, at fuzziness 0 and 1.
describe FuzzyDateScanner do

  before do
    # 03/18/2014 or 3/18/2014
    @fs1 = FuzzyDateScanner.new("%m/%d/%Y", /([0 ][1-9]|1[012])[-\/.~X](0[1-9]|[12][0-9]|3[01])[-\/.~X](20[0-9][0-9]|[0-9][0-9]$)/)

    # 18/03/2014
    @fs2 = FuzzyDateScanner.new("%d/%m/%Y", /(0[1-9]|[12][0-9]|3[01])[-\/.~X](0[1-9]|1[012])[-\/.~X](20[0-9][0-9]|[0-9][0-9]$)/)
  end

  describe "Fuzzily find matching date!" do

    describe "finds the matches" do

      describe "m/d/Y" do
        it "finds perfectly matched text for %m/%d/%y with fuzziness 0" do
          found = @fs1.fscan!("03/18/14", 0)
          assert_equal 1, found.length
          assert_equal "03/18/14", found[0][0]
        end

        it "finds perfectly matched text for %m/%d/%Y with fuzziness 0" do
          found = @fs1.fscan!("03/18/2014", 0)
          assert_equal 1, found.length
          assert_equal "03/18/2014", found[0][0]
        end

        it "finds 1-edit matched text with fuzziness 1" do
          found = @fs1.fscan!("03/18/201A", 1)
          assert_equal 1, found.length
          assert_equal "03/18/201A", found[0][0]
        end
      end

      describe "d/m/Y" do
        it "finds perfectly matched text for %d/%m/%y with fuzziness 0" do
          found = @fs2.fscan!("23/08/14", 0)
          assert_equal 1, found.length
          assert_equal "23/08/14", found[0][0]
        end

        it "finds perfectly matched text for %d/%m/%Y with fuzziness 0" do
          found = @fs2.fscan!("13/08/2014", 0)
          assert_equal 1, found.length
          assert_equal "13/08/2014", found[0][0]
        end

        it "finds 1-edit matched text with fuzziness 1" do
          found = @fs2.fscan!("18/01/201A", 1)
          assert_equal 1, found.length
          assert_equal "18/01/201A", found[0][0]
        end
      end

    end

    describe "cannot find the matches" do

      describe "m/d/Y" do
        it "cannot find 1-subbed text with fuzziness 0 (by default one sub costs 1)" do
          assert_empty @fs1.fscan!("18/18/14", 0)
        end

        it "cannot find 2-subbed text with fuzziness 1 (by default one sub costs 1)" do
          assert_empty @fs1.fscan!("18/18/I4", 1)
        end

        it "cannot find 2-subbed text with fuzziness 1 (by default one sub costs 1)" do
          assert_empty @fs1.fscan!(" 3/I6/2DI4", 1)
        end
      end

      describe "d/m/Y" do
        it "cannot find 1-subbed text with fuzziness 0 (by default one sub costs 1)" do
          assert_empty @fs2.fscan!("08/18/14", 0)
        end

        it "cannot find 2-subbed text with fuzziness 1 (by default one sub costs 1)" do
          assert_empty @fs2.fscan!("08/18/I4", 1)
        end

        it "cannot find 2-subbed text with fuzziness 1 (by default one sub costs 1)" do
          assert_empty @fs2.fscan!("13/I6/20I4", 1)
        end
      end

    end

  end

end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
require 'minitest_helper'
|
2
|
+
|
3
|
+
# Specs for FuzzyDate: raw scanning (#fscan) and date extraction (#to_date)
# on three noisy samples.
describe FuzzyDate do

  before do
    @sample1 = "Re9 Trans Dafe/TIme CashIer\n003 7269 2014-O4-1q l8:21 O1972808\n610984528 BCCESSORIES $20,00 $12 DO\nTrans DIscounf $8.00"
    @sample2 = " 4/17/2014 1:23 PM }"
    @sample3 = "4ApR2O,4"
    @fd = FuzzyDate.new(1)
  end


  describe "it scans the dates from text" do

    it "gets dates matches for %Y-%m-%d with default fuzziness 1" do
      found = @fd.fscan(@sample1)
      assert_equal 2, found.length
      assert_equal "%Y-%m-%d", found[0][1]
      # a false positive is expected here: #fscan defaults to fuzziness 2
      assert_equal "21 O19", found[1][0][0][0]
    end

    it "gets dates matches for %d%b%Y with default fuzziness 1" do
      found = @fd.fscan(@sample3)
      assert_equal 1, found.length
      assert_equal "%d%b%Y", found[0][1]
      # the raw, uncorrected text is what the scanner reports
      assert_equal "4ApR2O,4", found[0][0][0][0]
    end

    it "gets dates matches for %m/%d/%Y with fuzziness 1" do
      found = @fd.fscan(@sample2, 1)
      assert_equal 1, found.length
      assert_equal "%m/%d/%Y", found[0][1]
    end

  end

  describe "it returns date" do

    it "scans and find the date with max fuzziness 1" do
      @fd.max_fuzz = 3
      dates = @fd.to_date(@sample3)
      assert_equal 1, dates.length
      assert_equal "2014-04-04", dates[0].to_s
    end

    it "scans and find the date with max fuzziness 2" do
      dates = @fd.to_date(@sample2)
      assert_equal 1, dates.length
      assert_equal "2014-04-17", dates[0].to_s
    end

  end

  describe "it scans but cannot find date with fuzziness 1" do

    it "scans and returns the date with max fuzziness 1" do
      @fd.max_fuzz = 1
      dates = @fd.to_date(@sample1)
      assert_equal 0, dates.length
    end

    it "scans and returns the date with max fuzziness 1" do
      @fd.max_fuzz = 2
      dates = @fd.to_date(@sample1)
      assert_equal 1, dates.length
      assert_equal "2014-04-14", dates[0].to_s
    end

    it "string can extend this module scans and returns the date with max fuzziness 1" do
      dates = @sample1.extend(FuzzyDateString).to_date(2)
      assert_equal 1, dates.length
    end

  end


end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'minitest_helper'
|
4
|
+
|
5
|
+
# Specs for FuzzyPrice#to_price against "Total ..." lines of increasing
# noise, at different max_fuzz settings.
describe FuzzyPrice do

  before do
    @fp = FuzzyPrice.new(1)
    @sample0 = "Total 71.75"
    @sample1 = ": % ` ’ ' Total . k` 27.32“"
    @sample2 = "TofaI $26.18"
    @sample3 = "T0faI 55.IA"
    @sample4 = "T0faI 55.15"
    @sample5 = "Total 35—O3"
    @sample6 = "TOTAL $10.13"
  end

  describe "Finds price total" do

    it "finds price total with fuzziness 1" do
      found = @fp.to_price(@sample0, FuzzyPrice::TOTAL_TEXT_REGEX)
      assert_equal 1, found.length
      assert_equal 71.75, found[0]
    end

    it "finds price total with fuzziness 2" do
      @fp.max_fuzz = 2
      found = @fp.to_price(@sample2, FuzzyPrice::TOTAL_TEXT_REGEX)
      assert_equal 1, found.length
      assert_equal 26.18, found[0]

      found = @fp.to_price(@sample5, FuzzyPrice::TOTAL_TEXT_REGEX)
      assert_equal 1, found.length
      assert_equal 35.03, found[0]
    end

    it "finds price total with fuzz 3" do
      @fp.max_fuzz = 3
      found = @fp.to_price(@sample4, FuzzyPrice::TOTAL_TEXT_REGEX)
      assert_equal 1, found.length
      assert_equal 55.15, found[0]
    end

  end

  describe "Cannot find price total" do

    it "cannot find price total with fuzziness 1" do
      @fp.max_fuzz = 1
      found = @fp.to_price(@sample3, FuzzyPrice::TOTAL_TEXT_REGEX)
      assert_equal 0, found.length
    end

    it "cannot find price total with fuzz 2, text too messy" do
      found = @fp.to_price(@sample1, FuzzyPrice::TOTAL_TEXT_REGEX)
      assert_equal 0, found.length
    end

  end


end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'minitest_helper'
|
2
|
+
|
3
|
+
# Specs for FuzzyScanner#fscan! with a two-digit-year m/d/y pattern.
describe FuzzyScanner do

  before do
    @fs = FuzzyScanner.new
    @fs.regex = /([0 ][1-9]|1[012])[-\/.~X](0[1-9]|[12][0-9]|3[01])[-\/.~X]([0-9][0-9]$)/
  end

  describe "Fuzzily find matching text!" do

    describe "finds the matches" do

      it "finds perfectly matched text with fuzziness 0" do
        found = @fs.fscan!("03/18/14", 0)
        assert_equal 1, found.length
        assert_equal "03/18/14", found[0][0]
      end

      it "finds 1-edit matched text with fuzziness 1" do
        found = @fs.fscan!("03/I8/14", 1)
        assert_equal 1, found.length
        assert_equal "03/I8/14", found[0][0]
      end

    end

    describe "cannot find the matches" do

      it "cannot find 2-edit text with fuzziness 1" do
        assert_empty @fs.fscan!("03/18/IA", 1)
      end

      it "cannot find matches with regex not matching" do
        assert_empty @fs.fscan!("18/18/14", 0)
      end

    end

  end

end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'minitest_helper'
|
2
|
+
|
3
|
+
# Specs for FuzzySub#fsub! and the FuzzySubString mixin.
describe FuzzySub do

  before do
    @fs = FuzzySub.new({ "A" => "4" })
  end

  describe "Find text to sub!" do

    it "must only sub the should subbed words" do
      # "O" is not in the table, so only the trailing "A" changes
      assert_equal "O3/18/14", @fs.fsub!("O3/18/1A")
    end

  end

  describe "String extends fuzzy sub" do

    it "String extends fuzzy sub should be able to do fuzzy sub!" do
      subject_text = "O3/18/1A".extend(FuzzySubString)
      assert_equal "O3/18/14", subject_text.fsub!({ "A" => "4" })
    end

  end

end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require 'minitest_helper'
|
2
|
+
|
3
|
+
# Specs for FuzzyWords#find_words against inline samples and two fixture
# files.
describe FuzzyWords do

  before do
    @fw = FuzzyWords.new
    @words = ["JALAPENO CHILE", "CHILE D/ARBOL", "GRAPEFRUIT LARGE", "HASS SML AVOCADO", "MICHL KORS MS",
              "LAUREN PETITE", "BROWN CHICKEN", "ASN/TAS DRIED NOODLE MED", "KOBE VENOMENON YOT"]

    @test1 = " JALAPENO CHILE\n2.6l lb @ $O.69/lb $l.8O F\nCHILE D/ARBOL \nl.O8 lb @ $3,49/lb $3.77 F "
    @test2 = " \n2.6l lb @ $O.69/lb $l.8O F\nCHIL \nl.O8 lb @ $3,49/lb $3.77 F "
    @test3 = File.read(File.expand_path("../../fixtures/sample1.txt", __FILE__))
    @unreadable_test4 = File.read(File.expand_path("../../fixtures/sample2.txt", __FILE__))
  end

  describe "Find text matches words" do

    it "finds the similar words" do
      hits = @fw.find_words(@test1, @words)
      assert_equal 2, hits.length
    end

    it "finds the similar words" do
      hits = @fw.find_words(@test3, ["BUTTERFLY PRINT TOP:MULTI"])
      assert_equal 1, hits.length
      assert_equal "BUTTERFLY PRINT TOP:MULTI", hits[0][:word]
      assert_equal ["BUTTERFLY PRINT TOP:MULTI"], hits[0][:matches]
    end

  end

  describe "Cannot find test matched words" do

    it "does not find the similar words" do
      hits = @fw.find_words(@test2, @words)
      assert_equal 0, hits.length
    end

    it "does not find any words with max_fuzz 7" do
      @fw.max_fuzz = 7
      hits = @fw.find_words(@unreadable_test4, @words)
      assert_equal 0, hits.length
    end

  end

end
|
metadata
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: fuzzy-matcher
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Qi He
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2014-04-18 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: ! ' Fuzzy matcher looks for fuzzy matches such as words, digits, etc.
|
15
|
+
in a string of text using regex or string.
|
16
|
+
|
17
|
+
'
|
18
|
+
email: qihe229@gmail.com
|
19
|
+
executables: []
|
20
|
+
extensions: []
|
21
|
+
extra_rdoc_files: []
|
22
|
+
files:
|
23
|
+
- lib/fuzzy_matcher/fuzzy_date.rb
|
24
|
+
- lib/fuzzy_matcher/fuzzy_date_scanner.rb
|
25
|
+
- lib/fuzzy_matcher/fuzzy_price.rb
|
26
|
+
- lib/fuzzy_matcher/fuzzy_scanner.rb
|
27
|
+
- lib/fuzzy_matcher/fuzzy_sub.rb
|
28
|
+
- lib/fuzzy_matcher/fuzzy_words.rb
|
29
|
+
- lib/fuzzy_matcher/version.rb
|
30
|
+
- lib/fuzzy_matcher.rb
|
31
|
+
- spec/fuzzy_matcher/fuzzy_date_scanner_spec.rb
|
32
|
+
- spec/fuzzy_matcher/fuzzy_date_spec.rb
|
33
|
+
- spec/fuzzy_matcher/fuzzy_price_spec.rb
|
34
|
+
- spec/fuzzy_matcher/fuzzy_scanner_spec.rb
|
35
|
+
- spec/fuzzy_matcher/fuzzy_sub_spec.rb
|
36
|
+
- spec/fuzzy_matcher/fuzzy_words_spec.rb
|
37
|
+
- spec/minitest_helper.rb
|
38
|
+
homepage: http://github.com/he9qi/fuzzy_matcher
|
39
|
+
licenses:
|
40
|
+
- MIT
|
41
|
+
post_install_message:
|
42
|
+
rdoc_options: []
|
43
|
+
require_paths:
|
44
|
+
- lib
|
45
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
46
|
+
none: false
|
47
|
+
requirements:
|
48
|
+
- - ! '>='
|
49
|
+
- !ruby/object:Gem::Version
|
50
|
+
version: '0'
|
51
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
52
|
+
none: false
|
53
|
+
requirements:
|
54
|
+
- - ! '>='
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: '0'
|
57
|
+
requirements: []
|
58
|
+
rubyforge_project:
|
59
|
+
rubygems_version: 1.8.23
|
60
|
+
signing_key:
|
61
|
+
specification_version: 3
|
62
|
+
summary: Fuzzy matches words, digits, etc. in a string of text.
|
63
|
+
test_files:
|
64
|
+
- spec/fuzzy_matcher/fuzzy_date_scanner_spec.rb
|
65
|
+
- spec/fuzzy_matcher/fuzzy_date_spec.rb
|
66
|
+
- spec/fuzzy_matcher/fuzzy_price_spec.rb
|
67
|
+
- spec/fuzzy_matcher/fuzzy_scanner_spec.rb
|
68
|
+
- spec/fuzzy_matcher/fuzzy_sub_spec.rb
|
69
|
+
- spec/fuzzy_matcher/fuzzy_words_spec.rb
|
70
|
+
- spec/minitest_helper.rb
|