fuzzy-matcher 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/fuzzy_matcher/fuzzy_date.rb +119 -0
- data/lib/fuzzy_matcher/fuzzy_date_scanner.rb +27 -0
- data/lib/fuzzy_matcher/fuzzy_price.rb +83 -0
- data/lib/fuzzy_matcher/fuzzy_scanner.rb +14 -0
- data/lib/fuzzy_matcher/fuzzy_sub.rb +33 -0
- data/lib/fuzzy_matcher/fuzzy_words.rb +37 -0
- data/lib/fuzzy_matcher/version.rb +5 -0
- data/lib/fuzzy_matcher.rb +6 -0
- data/spec/fuzzy_matcher/fuzzy_date_scanner_spec.rb +99 -0
- data/spec/fuzzy_matcher/fuzzy_date_spec.rb +77 -0
- data/spec/fuzzy_matcher/fuzzy_price_spec.rb +62 -0
- data/spec/fuzzy_matcher/fuzzy_scanner_spec.rb +44 -0
- data/spec/fuzzy_matcher/fuzzy_sub_spec.rb +25 -0
- data/spec/fuzzy_matcher/fuzzy_words_spec.rb +47 -0
- data/spec/minitest_helper.rb +4 -0
- metadata +70 -0
@@ -0,0 +1,119 @@
|
|
1
|
+
require 'date'
|
2
|
+
|
3
|
+
# Finds calendar dates in noisy text (the samples in the specs look like OCR
# output) by approximate regex matching.
#
# Every strptime format in FORMAT_2_REGEX gets its own FuzzyDateScanner. The
# fragments a scanner returns are cleaned with FuzzySub (look-alike characters
# replaced by digits) and then parsed with Date.strptime.
class FuzzyDate

  # strptime format => regex locating candidates for that format.
  # The three capture groups are the date components in the order in which
  # they appear in the format string.
  FORMAT_2_REGEX = {
    # 03/18/2014 or 3/18/2014
    "%m/%d/%Y" => /(0?[1-9]|1[012])[-\/.~X](0?[1-9]|[12][0-9]|3[01])[-\/.~X](20[0-9][0-9]|[0-9][0-9]$)/,

    # 18/03/2014
    "%d/%m/%Y" => /(0[1-9]|[12][0-9]|3[01])[-\/.~X](0[1-9]|1[012])[-\/.~X](20[0-9][0-9]|[0-9][0-9]$)/,

    # 2014-04-14
    "%Y-%m-%d" => /(20[0-9][0-9])[-.~](0[1-9]|1[012])[-.~](0[1-9]|[12][0-9]|3[01])/,

    # 10APR2014 or 4APR2014
    # NOTE(review): the year group reads as "19" OR "20xx", so a bare "19" is
    # accepted as the year (and "10APR1999" yields the year "19"). Left as is
    # because the date spec pins a match that depends on it; confirm and
    # tighten together with that spec.
    "%d%b%Y" => /(0?[1-9]|[12][0-9]|3[01])(JAN|FEB|MAR|APR|ApR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)(19|20[0-9][0-9])/,

    # April 7, 2014
    # Both centuries are spelled out: "(19|20[0-9][0-9])" would stop after
    # "19" and turn "April 7, 1999" into the year 0019.
    "%B %d, %Y" => /(January|February|March|April|May|June|July|August|September|October|November|December) *(0?[1-9]|[12][0-9]|3[01])[,.]? *(19[0-9][0-9]|20[0-9][0-9])/
  }

  attr_accessor :max_fuzz

  # max_fuzz - highest fuzziness #to_date will try (it starts at 1).
  def initialize(max_fuzz=2)
    @fsub = FuzzySub.new FuzzySub::CHAR_2_NUM_SUB
    @max_fuzz = max_fuzz
    @scanners = []
    FORMAT_2_REGEX.each { |format, regex| register(format, regex) }
  end

  # Adds a scanner for one more format/regex pair.
  def register(format, regex)
    @scanners << FuzzyDateScanner.new(format, regex)
  end

  # Runs every registered scanner over string.
  # allow fuzziness of 2 by default
  #
  # Returns an array of [matches, format] pairs, one per scanner that found
  # something; scanners without matches are left out.
  def fscan(string, fuzziness=2)
    @scanners.map do |scanner|
      found = scanner.fscan!(string, fuzziness)
      [found, scanner.format] unless found.empty?
    end.compact
  end

  # Rebuilds one match into a string Date.strptime can parse.
  #
  # m      - a single match: m[0] is the matched text, m[1]..m[3] the
  #          capture groups of the regex registered for format.
  # format - the strptime format the match was found with.
  #
  # Numeric components go through FuzzySub. When the year has fewer than four
  # characters the format is switched to its two-digit-year (%y) variant.
  # Formats without a branch fall back to the raw matched text.
  #
  # Returns [string, format].
  def validaterize(m, format)
    str = m[0]

    case format
    when "%Y-%m-%d"
      date = @fsub.fsub!(m[3])
      mont = @fsub.fsub!(m[2])
      year = @fsub.fsub!(m[1])
      str = "#{year}-#{mont}-#{date}"
    when "%m/%d/%Y"
      date = @fsub.fsub!(m[2])
      mont = @fsub.fsub!(m[1])
      year = @fsub.fsub!(m[3])
      format = "%m/%d/%y" if year.length < 4
      str = "#{mont}/#{date}/#{year}"
    when "%d/%m/%Y"
      date = @fsub.fsub!(m[1])
      mont = @fsub.fsub!(m[2])
      year = @fsub.fsub!(m[3])
      format = "%d/%m/%y" if year.length < 4
      str = "#{date}/#{mont}/#{year}"
    when "%d%b%Y"
      date = @fsub.fsub!(m[1])
      mont = m[2].upcase
      year = @fsub.fsub!(m[3])
      format = "%d%b%y" if year.length < 4
      str = "#{date}#{mont}#{year}"
    when "%B %d, %Y"
      # The regex tolerates "." or nothing where the format demands ", ";
      # rebuild the canonical layout so such matches are not lost in strptime.
      mont = m[1]
      date = @fsub.fsub!(m[2])
      year = @fsub.fsub!(m[3])
      str = "#{mont} #{date}, #{year}"
    end

    [str.strip, format]
  end

  # Converts the output of #fscan into Date objects.
  #
  # Candidates that Date.strptime rejects (ArgumentError) are skipped on
  # purpose: a fuzzy match is only a guess and need not be a real date.
  def matches_to_dates(matches)
    dates = []
    matches.each do |strings, format|
      strings.each do |m|
        str, fmt = validaterize(m, format)
        begin
          dates << Date.strptime(str, fmt)
        rescue ArgumentError
          # not a valid date for this format; ignore the candidate
        end
      end
    end
    dates
  end

  # Iteratively looks for dates, trying fuzziness 1, then 2, ... up to
  # max_fuzz, and stops at the first level that yields at least one date.
  #
  # Returns an array of Date (empty when nothing parseable was found).
  def to_date(string)
    dates = []
    1.upto(@max_fuzz) do |fuzz|
      dates = matches_to_dates(fscan(string, fuzz))
      break unless dates.empty?
    end
    dates
  end

end
|
112
|
+
|
113
|
+
# Mixin that lets the receiver (a String in the specs, via #extend) look for
# dates in itself.
module FuzzyDateString

  # Builds a FuzzyDate limited to max_fuzz and hands the receiver to its
  # #to_date; whatever that call returns is returned unchanged.
  def to_date(max_fuzz=2)
    finder = FuzzyDate.new(max_fuzz)
    finder.to_date(self)
  end

end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# {
|
2
|
+
# # 03/18/2014 or 3/18/2014
|
3
|
+
# "%m/%d/%Y" => /([0 ][1-9]|1[012])[-\/.~X](0[1-9]|[12][0-9]|3[01])[-\/.~X](20[0-9][0-9]|[0-9][0-9]$)/,
|
4
|
+
#
|
5
|
+
# # 18/03/2014
|
6
|
+
# "%d/%m/%Y" => /(0[1-9]|[12][0-9]|3[01])[-\/.~X](0[1-9]|1[012])[-\/.~X](20[0-9][0-9]|[0-9][0-9]$)/,
|
7
|
+
#
|
8
|
+
# # 2014-04-14
|
9
|
+
# "%Y-%m-%d" => /(20[0-9][0-9])[-.~](0[1-9]|1[012])[-.~](0[1-9]|[12][0-9]|3[01])/,
|
10
|
+
#
|
11
|
+
# # 10APR2014 or 4APR2014
|
12
|
+
# "%d%b%Y" => /([0 ][1-9]|[12][0-9]|3[01])(JAN|FEB|MAR|APR|ApR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)(19|20[0-9][0-9])/,
|
13
|
+
#
|
14
|
+
# # April 7, 2014
|
15
|
+
# "%B %d, %Y" => /(January|February|March|April|May|June|July|August|September|October|November|December) *(0?[1-9]|[12][0-9]|3[01])[,.]? *(19|20[0-9][0-9])/
|
16
|
+
# }
|
17
|
+
|
18
|
+
# A FuzzyScanner that also remembers which strptime format its regex stands
# for, so a match can later be parsed with the right format.
class FuzzyDateScanner < FuzzyScanner

  attr_accessor :format

  # format - strptime format string associated with regex.
  # regex  - pattern stored in @regex, the variable behind the inherited
  #          `regex` accessor.
  def initialize(format, regex)
    @format, @regex = format, regex
  end

end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'tre-ruby'
|
4
|
+
|
5
|
+
# Core extension: lets two regular expressions be glued together with `+`.
class Regexp
  # Returns a new Regexp whose source is this pattern's source followed by
  # r's source.
  #
  # The sources are joined verbatim (no grouping is added), so wrap a pattern
  # containing a top-level alternation in parentheses before adding to it.
  # The i/x/m flags of both operands are carried over as a union; previously
  # they were dropped, which silently made `/a/i + /b/` case-sensitive.
  def +(r)
    # Keep only the user-visible flags; encoding bits are not valid input
    # for Regexp.new's options argument.
    flags = (options | r.options) & (IGNORECASE | EXTENDED | MULTILINE)
    Regexp.new(source + r.source, flags)
  end
end
|
10
|
+
|
11
|
+
# Pulls prices out of noisy text: an approximate match of a label regex
# (e.g. TOTAL_TEXT_REGEX) followed by an amount, searched line by line.
class FuzzyPrice

  # Optional "$", whole units, a separator, exactly two digits.
  #
  # * The separator is one of "." "_" "-" or an em dash. The hyphen sits last
  #   in the class so it is a literal: written as "[._-—]" it formed a range
  #   from "_" to the em dash, which also accepted every lowercase letter.
  # * Whole units are one or more digits. "[1-9]*[0-9]" could not match an
  #   amount such as "100" and reported "100.00" as 0.00.
  FUZZY_PRICE_REGEX = /\$?([0-9]+)[._—-]([0-9][0-9])/
  TOTAL_TEXT_REGEX = /(Total|TOTAL|Total Applied) +/

  attr_accessor :max_fuzz
  # Only the writer is generated; the reader below supplies the default.
  attr_writer :price_regex

  # max_fuzz - highest fuzziness #to_price will try (it starts at 1).
  def initialize(max_fuzz=2)
    @max_fuzz = max_fuzz
    @fsub = FuzzySub.new FuzzySub::CHAR_2_NUM_SUB
  end

  # The amount pattern in use; FUZZY_PRICE_REGEX unless one was assigned.
  def price_regex
    @price_regex ||= FUZZY_PRICE_REGEX
  end

  # Scans every line of file for text_regex immediately followed by an amount.
  #
  # file         - text to search; split on "\n".
  # text_regex   - Regexp for the label in front of the amount.
  # fuzzy_thresh - fuzziness handed to TRE for both matching passes.
  #
  # The amount pattern is the configurable #price_regex in both passes (the
  # combined pattern used to hard-code FUZZY_PRICE_REGEX and ignore it).
  #
  # Returns an array of Floats, possibly empty.
  def find_price(file, text_regex, fuzzy_thresh=2)
    labelled_price = text_regex + price_regex
    prices = []

    file.split("\n").each do |line|
      hits = line.extend(TRE).ascan labelled_price, TRE.fuzziness(fuzzy_thresh)
      hits.each do |hit|
        # match the price regex separately again to improve accuracy
        amount = (hit[0].extend(TRE).ascan price_regex, TRE.fuzziness(fuzzy_thresh))[0]
        next unless amount

        whole = amount[1]
        cents = amount[2]
        # a group that did not take part in the match has nothing to clean
        next unless whole && cents

        whole = @fsub.fsub!(whole)
        cents = @fsub.fsub!(cents)
        next if whole.empty? || cents.empty?

        # drop any "$" that ended up inside a captured group
        prices << "#{whole}.#{cents}".delete("$").to_f
      end
    end

    prices
  end

  # Tries #find_price with fuzziness 1, 2, ... up to max_fuzz and stops at
  # the first level that finds at least one price.
  #
  # Returns an array of Floats, possibly empty.
  def to_price(string, regex)
    prices = []
    1.upto(@max_fuzz) do |fuzz|
      prices = find_price(string, regex, fuzz)
      break unless prices.empty?
    end
    prices
  end

end
|
76
|
+
|
77
|
+
# Mixin that lets the receiver (a String, via #extend) search itself for
# prices.
module FuzzyPriceString

  # regex    - label pattern expected in front of the amount.
  # max_fuzz - upper fuzziness bound given to the FuzzyPrice instance.
  #
  # Returns the result of FuzzyPrice#to_price for the receiver.
  def to_price(regex, max_fuzz=2)
    finder = FuzzyPrice.new(max_fuzz)
    finder.to_price(self, regex)
  end

end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'tre-ruby'
|
2
|
+
|
3
|
+
# Approximate regex scanner built on TRE.
class FuzzyScanner

  # Pattern handed to TRE by #fscan!.
  attr_accessor :regex

  # Returns the distinct approximate matches of #regex in str.
  # allow fuzziness of 2 by default
  #
  # Newlines are treated as spaces so a match may span a line break. The
  # replacement is made on a copy: the caller's string is neither rewritten
  # nor extended with TRE, so frozen strings work and the text can still be
  # split into lines afterwards. The trailing "!" is kept only so existing
  # callers keep working.
  def fscan!(str, fuzziness=2)
    flattened = str.tr("\n", " ")
    flattened.extend(TRE).ascan(regex, TRE.fuzziness(fuzziness)).uniq
  end

end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# Replaces look-alike characters in a string according to a substitution
# table.
class FuzzySub

  # Characters commonly misread for digits => the digit to use instead.
  # Each key is a SET of characters; every character in it maps to the value.
  # Frozen because the same hash object is shared by every user of the
  # constant.
  CHAR_2_NUM_SUB = {
    "A" => "4",
    "OoD" => "0",
    "liI," => "1",
    "q" => "4" # could be 9
  }.freeze

  attr_accessor :sub_hash

  # sub_hash - Hash of character set => replacement string.
  def initialize(sub_hash)
    @sub_hash = sub_hash
  end

  # Applies every substitution of sub_hash to word IN PLACE, in hash order.
  #
  # Keys are escaped before being placed in a character class, so "-", "^",
  # "]" and "\" inside a key stand for themselves instead of forming ranges,
  # negations or invalid patterns. An empty key is skipped (it would build
  # the invalid regex /[]/).
  #
  # Returns word itself.
  def fsub!(word)
    sub_hash.each do |chars, replacement|
      chars = chars.to_s
      next if chars.empty?
      word.gsub!(/[#{Regexp.escape(chars)}]/, replacement)
    end
    word
  end

end
|
24
|
+
|
25
|
+
# Mixin that lets the receiver (a String, via #extend) run a FuzzySub
# substitution on itself.
module FuzzySubString

  # sub_hash - substitution table passed to FuzzySub.new.
  #
  # Returns the result of FuzzySub#fsub! applied to the receiver.
  def fsub!(sub_hash)
    FuzzySub.new(sub_hash).fsub!(self)
  end

end
|
33
|
+
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Looks for approximate occurrences of known words or phrases in a text.
class FuzzyWords

  attr_accessor :max_fuzz

  # max_fuzz - highest fuzziness #find_word will try (it starts at 1).
  def initialize(max_fuzz=4)
    @max_fuzz = max_fuzz
  end

  # Searches file for each entry of words.
  #
  # Returns an array of { word: <entry>, matches: <matches> } hashes, one per
  # entry that produced at least one match; entries without matches are
  # omitted.
  def find_words(file, words)
    words.map do |w|
      matches = find_word file, w
      { word: w, matches: matches } unless matches.empty?
    end.compact
  end

  # Searches text for word with fuzziness 1, 2, ... up to max_fuzz and
  # returns the matches of the first level that finds any (else []).
  #
  # The search runs on a private copy of text, extended with TRE once: the
  # caller's string is left untouched, so frozen strings are accepted.
  def find_word(text, word)
    haystack = text.dup.extend(TRE)
    matches = []
    1.upto(@max_fuzz) do |fuzz|
      matches = haystack.ascan word, TRE.fuzziness(fuzz)
      break unless matches.empty?
    end
    matches
  end

end
|
28
|
+
|
29
|
+
# Mixin that lets the receiver (a String, via #extend) be searched for a list
# of words.
module FuzzyWordsString

  # words    - entries to look for in the receiver.
  # max_fuzz - upper fuzziness bound given to the FuzzyWords instance.
  #
  # Returns the result of FuzzyWords#find_words for the receiver.
  def fuzzy_match_words(words, max_fuzz=4)
    matcher = FuzzyWords.new(max_fuzz)
    matcher.find_words(self, words)
  end

end
|
36
|
+
|
37
|
+
|
@@ -0,0 +1,99 @@
|
|
1
|
+
require 'minitest_helper'
|
2
|
+
|
3
|
+
describe FuzzyDateScanner do
|
4
|
+
|
5
|
+
before do
|
6
|
+
# 03/18/2014 or 3/18/2014
|
7
|
+
@fs1 = FuzzyDateScanner.new "%m/%d/%Y", /([0 ][1-9]|1[012])[-\/.~X](0[1-9]|[12][0-9]|3[01])[-\/.~X](20[0-9][0-9]|[0-9][0-9]$)/
|
8
|
+
|
9
|
+
# 18/03/2014
|
10
|
+
@fs2 = FuzzyDateScanner.new "%d/%m/%Y", /(0[1-9]|[12][0-9]|3[01])[-\/.~X](0[1-9]|1[012])[-\/.~X](20[0-9][0-9]|[0-9][0-9]$)/
|
11
|
+
end
|
12
|
+
|
13
|
+
describe "Fuzzily find matching date!" do
|
14
|
+
|
15
|
+
describe "finds the matches" do
|
16
|
+
|
17
|
+
describe "m/d/Y" do
|
18
|
+
it "finds perfectly matched text for %m/%d/%y with fuzziness 0" do
|
19
|
+
matches = @fs1.fscan!("03/18/14", 0)
|
20
|
+
matches.length.must_equal 1
|
21
|
+
matches[0][0].must_equal "03/18/14"
|
22
|
+
end
|
23
|
+
|
24
|
+
it "finds perfectly matched text for %m/%d/%Y with fuzziness 0" do
|
25
|
+
matches = @fs1.fscan!("03/18/2014", 0)
|
26
|
+
matches.length.must_equal 1
|
27
|
+
matches[0][0].must_equal "03/18/2014"
|
28
|
+
end
|
29
|
+
|
30
|
+
it "finds 1-edit matched text with fuzziness 1" do
|
31
|
+
matches = @fs1.fscan!("03/18/201A", 1)
|
32
|
+
matches.length.must_equal 1
|
33
|
+
matches[0][0].must_equal "03/18/201A"
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
describe "d/m/Y" do
|
38
|
+
it "finds perfectly matched text for %d/%m/%y with fuzziness 0" do
|
39
|
+
matches = @fs2.fscan!("23/08/14", 0)
|
40
|
+
matches.length.must_equal 1
|
41
|
+
matches[0][0].must_equal "23/08/14"
|
42
|
+
end
|
43
|
+
|
44
|
+
it "finds perfectly matched text for %d/%m/%Y with fuzziness 0" do
|
45
|
+
matches = @fs2.fscan!("13/08/2014", 0)
|
46
|
+
matches.length.must_equal 1
|
47
|
+
matches[0][0].must_equal "13/08/2014"
|
48
|
+
end
|
49
|
+
|
50
|
+
it "finds 1-edit matched text with fuzziness 1" do
|
51
|
+
matches = @fs2.fscan!("18/01/201A", 1)
|
52
|
+
matches.length.must_equal 1
|
53
|
+
matches[0][0].must_equal "18/01/201A"
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
end
|
58
|
+
|
59
|
+
describe "cannot find the matches" do
|
60
|
+
|
61
|
+
describe "m/d/Y" do
|
62
|
+
it "cannot find 1-subbed text with fuzziness 0 (by default one sub costs 1)" do
|
63
|
+
matches = @fs1.fscan!("18/18/14", 0)
|
64
|
+
matches.must_be_empty
|
65
|
+
end
|
66
|
+
|
67
|
+
it "cannot find 2-subbed text with fuzziness 1 (by default one sub costs 1)" do
|
68
|
+
matches = @fs1.fscan!("18/18/I4", 1)
|
69
|
+
matches.must_be_empty
|
70
|
+
end
|
71
|
+
|
72
|
+
it "cannot find 2-subbed text with fuzziness 1 (by default one sub costs 1)" do
|
73
|
+
matches = @fs1.fscan!(" 3/I6/2DI4", 1)
|
74
|
+
matches.must_be_empty
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
describe "d/m/Y" do
|
79
|
+
it "cannot find 1-subbed text with fuzziness 0 (by default one sub costs 1)" do
|
80
|
+
matches = @fs2.fscan!("08/18/14", 0)
|
81
|
+
matches.must_be_empty
|
82
|
+
end
|
83
|
+
|
84
|
+
it "cannot find 2-subbed text with fuzziness 1 (by default one sub costs 1)" do
|
85
|
+
matches = @fs2.fscan!("08/18/I4", 1)
|
86
|
+
matches.must_be_empty
|
87
|
+
end
|
88
|
+
|
89
|
+
it "cannot find 2-subbed text with fuzziness 1 (by default one sub costs 1)" do
|
90
|
+
matches = @fs2.fscan!("13/I6/20I4", 1)
|
91
|
+
matches.must_be_empty
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
end
|
96
|
+
|
97
|
+
end
|
98
|
+
|
99
|
+
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
require 'minitest_helper'
|
2
|
+
|
3
|
+
describe FuzzyDate do
|
4
|
+
|
5
|
+
before do
|
6
|
+
@sample1 = "Re9 Trans Dafe/TIme CashIer\n003 7269 2014-O4-1q l8:21 O1972808\n610984528 BCCESSORIES $20,00 $12 DO\nTrans DIscounf $8.00"
|
7
|
+
@sample2 = " 4/17/2014 1:23 PM }"
|
8
|
+
@sample3 = "4ApR2O,4"
|
9
|
+
@fd = FuzzyDate.new 1
|
10
|
+
end
|
11
|
+
|
12
|
+
|
13
|
+
describe "it scans the dates from text" do
|
14
|
+
|
15
|
+
it "gets dates matches for %Y-%m-%d with default fuzziness 1" do
|
16
|
+
matches = @fd.fscan @sample1
|
17
|
+
matches.length.must_equal 2
|
18
|
+
matches[0][1].must_equal "%Y-%m-%d"
|
19
|
+
matches[1][0][0][0].must_equal "21 O19" #unfortunately scanned a wrong date becoz fuzzy is 2
|
20
|
+
end
|
21
|
+
|
22
|
+
it "gets dates matches for %d%b%Y with default fuzziness 1" do
|
23
|
+
matches = @fd.fscan @sample3
|
24
|
+
matches.length.must_equal 1
|
25
|
+
matches[0][1].must_equal "%d%b%Y"
|
26
|
+
matches[0][0][0][0].must_equal "4ApR2O,4" #unfortunately scanned a wrong date becoz fuzzy is 2
|
27
|
+
end
|
28
|
+
|
29
|
+
it "gets dates matches for %m/%d/%Y with fuzziness 1" do
|
30
|
+
matches = @fd.fscan @sample2, 1
|
31
|
+
matches.length.must_equal 1
|
32
|
+
matches[0][1].must_equal "%m/%d/%Y"
|
33
|
+
end
|
34
|
+
|
35
|
+
end
|
36
|
+
|
37
|
+
describe "it returns date" do
|
38
|
+
|
39
|
+
it "scans and find the date with max fuzziness 1" do
|
40
|
+
@fd.max_fuzz = 3
|
41
|
+
dates = @fd.to_date @sample3
|
42
|
+
dates.length.must_equal 1
|
43
|
+
dates[0].to_s.must_equal "2014-04-04"
|
44
|
+
end
|
45
|
+
|
46
|
+
it "scans and find the date with max fuzziness 2" do
|
47
|
+
dates = @fd.to_date @sample2
|
48
|
+
dates.length.must_equal 1
|
49
|
+
dates[0].to_s.must_equal "2014-04-17"
|
50
|
+
end
|
51
|
+
|
52
|
+
end
|
53
|
+
|
54
|
+
describe "it scans but cannot find date with fuzziness 1" do
|
55
|
+
|
56
|
+
it "scans and returns the date with max fuzziness 1" do
|
57
|
+
@fd.max_fuzz = 1
|
58
|
+
dates = @fd.to_date @sample1
|
59
|
+
dates.length.must_equal 0
|
60
|
+
end
|
61
|
+
|
62
|
+
it "scans and returns the date with max fuzziness 1" do
|
63
|
+
@fd.max_fuzz = 2
|
64
|
+
dates = @fd.to_date @sample1
|
65
|
+
dates.length.must_equal 1
|
66
|
+
dates[0].to_s.must_equal "2014-04-14"
|
67
|
+
end
|
68
|
+
|
69
|
+
it "string can extend this module scans and returns the date with max fuzziness 1" do
|
70
|
+
dates = @sample1.extend(FuzzyDateString).to_date 2
|
71
|
+
dates.length.must_equal 1
|
72
|
+
end
|
73
|
+
|
74
|
+
end
|
75
|
+
|
76
|
+
|
77
|
+
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'minitest_helper'
|
4
|
+
|
5
|
+
describe FuzzyPrice do
|
6
|
+
|
7
|
+
before do
|
8
|
+
@fp = FuzzyPrice.new 1
|
9
|
+
@sample0 = "Total 71.75"
|
10
|
+
@sample1 = ": % ` ’ ' Total . k` 27.32“"
|
11
|
+
@sample2 = "TofaI $26.18"
|
12
|
+
@sample3 = "T0faI 55.IA"
|
13
|
+
@sample4 = "T0faI 55.15"
|
14
|
+
@sample5 = "Total 35—O3"
|
15
|
+
@sample6 = "TOTAL $10.13"
|
16
|
+
end
|
17
|
+
|
18
|
+
describe "Finds price total" do
|
19
|
+
|
20
|
+
it "finds price total with fuzziness 1" do
|
21
|
+
price = @fp.to_price @sample0, FuzzyPrice::TOTAL_TEXT_REGEX
|
22
|
+
price.length.must_equal 1
|
23
|
+
price[0].must_equal 71.75
|
24
|
+
end
|
25
|
+
|
26
|
+
it "finds price total with fuzziness 2" do
|
27
|
+
@fp.max_fuzz = 2
|
28
|
+
price = @fp.to_price @sample2, FuzzyPrice::TOTAL_TEXT_REGEX
|
29
|
+
price.length.must_equal 1
|
30
|
+
price[0].must_equal 26.18
|
31
|
+
|
32
|
+
price = @fp.to_price @sample5, FuzzyPrice::TOTAL_TEXT_REGEX
|
33
|
+
price.length.must_equal 1
|
34
|
+
price[0].must_equal 35.03
|
35
|
+
end
|
36
|
+
|
37
|
+
it "finds price total with fuzz 3" do
|
38
|
+
@fp.max_fuzz = 3
|
39
|
+
price = @fp.to_price @sample4, FuzzyPrice::TOTAL_TEXT_REGEX
|
40
|
+
price.length.must_equal 1
|
41
|
+
price[0].must_equal 55.15
|
42
|
+
end
|
43
|
+
|
44
|
+
end
|
45
|
+
|
46
|
+
describe "Cannot find price total" do
|
47
|
+
|
48
|
+
it "cannot find price total with fuzziness 1" do
|
49
|
+
@fp.max_fuzz = 1
|
50
|
+
price = @fp.to_price @sample3, FuzzyPrice::TOTAL_TEXT_REGEX
|
51
|
+
price.length.must_equal 0
|
52
|
+
end
|
53
|
+
|
54
|
+
it "cannot find price total with fuzz 2, text too messy" do
|
55
|
+
price = @fp.to_price @sample1, FuzzyPrice::TOTAL_TEXT_REGEX
|
56
|
+
price.length.must_equal 0
|
57
|
+
end
|
58
|
+
|
59
|
+
end
|
60
|
+
|
61
|
+
|
62
|
+
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'minitest_helper'
|
2
|
+
|
3
|
+
describe FuzzyScanner do
|
4
|
+
|
5
|
+
before do
|
6
|
+
@fs = FuzzyScanner.new
|
7
|
+
@fs.regex = /([0 ][1-9]|1[012])[-\/.~X](0[1-9]|[12][0-9]|3[01])[-\/.~X]([0-9][0-9]$)/
|
8
|
+
end
|
9
|
+
|
10
|
+
describe "Fuzzily find matching text!" do
|
11
|
+
|
12
|
+
describe "finds the matches" do
|
13
|
+
|
14
|
+
it "finds perfectly matched text with fuzziness 0" do
|
15
|
+
matches = @fs.fscan!("03/18/14", 0)
|
16
|
+
matches.length.must_equal 1
|
17
|
+
matches[0][0].must_equal "03/18/14"
|
18
|
+
end
|
19
|
+
|
20
|
+
it "finds 1-edit matched text with fuzziness 1" do
|
21
|
+
matches = @fs.fscan!("03/I8/14", 1)
|
22
|
+
matches.length.must_equal 1
|
23
|
+
matches[0][0].must_equal "03/I8/14"
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
27
|
+
|
28
|
+
describe "cannot find the matches" do
|
29
|
+
|
30
|
+
it "cannot find 2-edit text with fuzziness 1" do
|
31
|
+
matches = @fs.fscan!("03/18/IA", 1)
|
32
|
+
matches.must_be_empty
|
33
|
+
end
|
34
|
+
|
35
|
+
it "cannot find matches with regex not matching" do
|
36
|
+
matches = @fs.fscan!("18/18/14", 0)
|
37
|
+
matches.must_be_empty
|
38
|
+
end
|
39
|
+
|
40
|
+
end
|
41
|
+
|
42
|
+
end
|
43
|
+
|
44
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'minitest_helper'
|
2
|
+
|
3
|
+
describe FuzzySub do
|
4
|
+
|
5
|
+
before do
|
6
|
+
@fs = FuzzySub.new({ "A" => "4" })
|
7
|
+
end
|
8
|
+
|
9
|
+
describe "Find text to sub!" do
|
10
|
+
|
11
|
+
it "must only sub the should subbed words" do
|
12
|
+
@fs.fsub!("O3/18/1A").must_equal "O3/18/14"
|
13
|
+
end
|
14
|
+
|
15
|
+
end
|
16
|
+
|
17
|
+
describe "String extends fuzzy sub" do
|
18
|
+
|
19
|
+
it "String extends fuzzy sub should be able to do fuzzy sub!" do
|
20
|
+
"O3/18/1A".extend(FuzzySubString).fsub!({ "A" => "4" }).must_equal "O3/18/14"
|
21
|
+
end
|
22
|
+
|
23
|
+
end
|
24
|
+
|
25
|
+
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require 'minitest_helper'
|
2
|
+
|
3
|
+
describe FuzzyWords do
|
4
|
+
|
5
|
+
before do
|
6
|
+
@fw = FuzzyWords.new
|
7
|
+
@words = ["JALAPENO CHILE", "CHILE D/ARBOL", "GRAPEFRUIT LARGE", "HASS SML AVOCADO", "MICHL KORS MS",
|
8
|
+
"LAUREN PETITE", "BROWN CHICKEN", "ASN/TAS DRIED NOODLE MED", "KOBE VENOMENON YOT"]
|
9
|
+
|
10
|
+
@test1 = " JALAPENO CHILE\n2.6l lb @ $O.69/lb $l.8O F\nCHILE D/ARBOL \nl.O8 lb @ $3,49/lb $3.77 F "
|
11
|
+
@test2 = " \n2.6l lb @ $O.69/lb $l.8O F\nCHIL \nl.O8 lb @ $3,49/lb $3.77 F "
|
12
|
+
@test3 = File.read File.expand_path("../../fixtures/sample1.txt", __FILE__)
|
13
|
+
@unreadable_test4 = File.read File.expand_path("../../fixtures/sample2.txt", __FILE__)
|
14
|
+
end
|
15
|
+
|
16
|
+
describe "Find text matches words" do
|
17
|
+
|
18
|
+
it "finds the similar words" do
|
19
|
+
words = @fw.find_words(@test1, @words)
|
20
|
+
words.length.must_equal 2
|
21
|
+
end
|
22
|
+
|
23
|
+
it "finds the similar words" do
|
24
|
+
words = @fw.find_words(@test3, ["BUTTERFLY PRINT TOP:MULTI"])
|
25
|
+
words.length.must_equal 1
|
26
|
+
words[0][:word].must_equal "BUTTERFLY PRINT TOP:MULTI"
|
27
|
+
words[0][:matches].must_equal ["BUTTERFLY PRINT TOP:MULTI"]
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|
31
|
+
|
32
|
+
describe "Cannot find test matched words" do
|
33
|
+
|
34
|
+
it "does not find the similar words" do
|
35
|
+
words = @fw.find_words(@test2, @words)
|
36
|
+
words.length.must_equal 0
|
37
|
+
end
|
38
|
+
|
39
|
+
it "does not find any words with max_fuzz 7" do
|
40
|
+
@fw.max_fuzz = 7
|
41
|
+
words = @fw.find_words(@unreadable_test4, @words)
|
42
|
+
words.length.must_equal 0
|
43
|
+
end
|
44
|
+
|
45
|
+
end
|
46
|
+
|
47
|
+
end
|
metadata
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: fuzzy-matcher
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Qi He
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2014-04-18 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: ! ' Fuzzy matcher looks for fuzzy matches such as words, digits, etc.
|
15
|
+
in a string of text using regex or string.
|
16
|
+
|
17
|
+
'
|
18
|
+
email: qihe229@gmail.com
|
19
|
+
executables: []
|
20
|
+
extensions: []
|
21
|
+
extra_rdoc_files: []
|
22
|
+
files:
|
23
|
+
- lib/fuzzy_matcher/fuzzy_date.rb
|
24
|
+
- lib/fuzzy_matcher/fuzzy_date_scanner.rb
|
25
|
+
- lib/fuzzy_matcher/fuzzy_price.rb
|
26
|
+
- lib/fuzzy_matcher/fuzzy_scanner.rb
|
27
|
+
- lib/fuzzy_matcher/fuzzy_sub.rb
|
28
|
+
- lib/fuzzy_matcher/fuzzy_words.rb
|
29
|
+
- lib/fuzzy_matcher/version.rb
|
30
|
+
- lib/fuzzy_matcher.rb
|
31
|
+
- spec/fuzzy_matcher/fuzzy_date_scanner_spec.rb
|
32
|
+
- spec/fuzzy_matcher/fuzzy_date_spec.rb
|
33
|
+
- spec/fuzzy_matcher/fuzzy_price_spec.rb
|
34
|
+
- spec/fuzzy_matcher/fuzzy_scanner_spec.rb
|
35
|
+
- spec/fuzzy_matcher/fuzzy_sub_spec.rb
|
36
|
+
- spec/fuzzy_matcher/fuzzy_words_spec.rb
|
37
|
+
- spec/minitest_helper.rb
|
38
|
+
homepage: http://github.com/he9qi/fuzzy_matcher
|
39
|
+
licenses:
|
40
|
+
- MIT
|
41
|
+
post_install_message:
|
42
|
+
rdoc_options: []
|
43
|
+
require_paths:
|
44
|
+
- lib
|
45
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
46
|
+
none: false
|
47
|
+
requirements:
|
48
|
+
- - ! '>='
|
49
|
+
- !ruby/object:Gem::Version
|
50
|
+
version: '0'
|
51
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
52
|
+
none: false
|
53
|
+
requirements:
|
54
|
+
- - ! '>='
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: '0'
|
57
|
+
requirements: []
|
58
|
+
rubyforge_project:
|
59
|
+
rubygems_version: 1.8.23
|
60
|
+
signing_key:
|
61
|
+
specification_version: 3
|
62
|
+
summary: Fuzzy matches words, digits, etc. in a string of text.
|
63
|
+
test_files:
|
64
|
+
- spec/fuzzy_matcher/fuzzy_date_scanner_spec.rb
|
65
|
+
- spec/fuzzy_matcher/fuzzy_date_spec.rb
|
66
|
+
- spec/fuzzy_matcher/fuzzy_price_spec.rb
|
67
|
+
- spec/fuzzy_matcher/fuzzy_scanner_spec.rb
|
68
|
+
- spec/fuzzy_matcher/fuzzy_sub_spec.rb
|
69
|
+
- spec/fuzzy_matcher/fuzzy_words_spec.rb
|
70
|
+
- spec/minitest_helper.rb
|