fuzzy_match 2.0.3 → 2.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +6 -0
- data/Gemfile +2 -2
- data/README.markdown +10 -2
- data/bin/fuzzy_match +11 -11
- data/lib/fuzzy_match.rb +1 -1
- data/lib/fuzzy_match/version.rb +1 -1
- metadata +2 -2
data/CHANGELOG
CHANGED
data/Gemfile
CHANGED
data/README.markdown
CHANGED
@@ -1,4 +1,12 @@
|
|
1
|
-
|
1
|
+
## Top 3 reasons you should use FuzzyMatch
|
2
|
+
|
3
|
+
1. *intelligent defaults*: it uses a combination of Pair Distance (2-gram) and Levenshtein Edit Distance to effectively match many examples with no configuration
|
4
|
+
2. *all-vs-all*: it takes care of finding the optimal match by comparing everything against everything else (when that's necessary)
|
5
|
+
3. *refinable*: you might get to 90% with no configuration, but if you need to go beyond you can use regexps, grouping, and stop words
|
6
|
+
|
7
|
+
It solves many mid-range matching problems — if your haystack is ~10k records — if you can winnow down the initial possibilities at the database level and only bring good contenders into app memory — why not give it a shot?
|
8
|
+
|
9
|
+
# FuzzyMatch
|
2
10
|
|
3
11
|
Find a needle in a haystack based on string similarity and regular expression rules.
|
4
12
|
|
@@ -12,7 +20,7 @@ Warning! `normalizers` are gone in version 2 and above! See the CHANGELOG and ch
|
|
12
20
|
|
13
21
|
>> require 'fuzzy_match'
|
14
22
|
=> true
|
15
|
-
>> FuzzyMatch.new(['seamus', 'andy', 'ben']).find('Shamus)
|
23
|
+
>> FuzzyMatch.new(['seamus', 'andy', 'ben']).find('Shamus')
|
16
24
|
=> "seamus"
|
17
25
|
|
18
26
|
See also the blog post [Fuzzy match in Ruby](http://numbers.brighterplanet.com/2012/01/18/fuzzy-match-in-ruby/).
|
data/bin/fuzzy_match
CHANGED
@@ -18,15 +18,15 @@ require 'to_regexp'
|
|
18
18
|
class FuzzyMatch
|
19
19
|
class Cli < ::Thor
|
20
20
|
desc :match, "Print out matches between A and B, where A is haystack and B is a bunch of needles."
|
21
|
-
method_option :csv, default
|
22
|
-
method_option :a_col, default
|
23
|
-
method_option :b_col, default
|
24
|
-
method_option :downcase, default
|
25
|
-
method_option :groupings, default
|
26
|
-
method_option :rules, default
|
27
|
-
method_option :explain, default
|
28
|
-
method_option :grep, default
|
29
|
-
method_option :limit, default
|
21
|
+
method_option :csv, :default => false, :type => :boolean, :desc => "CSV output"
|
22
|
+
method_option :a_col, :default => 0, :type => :string, :desc => "Column name in A. Defaults to first column."
|
23
|
+
method_option :b_col, :default => 0, :type => :string, :desc => "Column name in B. Defaults to first column."
|
24
|
+
method_option :downcase, :default => true, :type => :boolean, :desc => "Whether to downcase everything (except regexes, where you have to do /foo/i)"
|
25
|
+
method_option :groupings, :default => nil, :type => :string, :desc => "Spreadsheet with groupings - no headers, multi-part groupings on the same row"
|
26
|
+
method_option :rules, :default => nil, :type => :string, :desc => "Spreadsheet with headers: stop_words, identities, find_options. Listing a find_option like must_match_grouping makes it true."
|
27
|
+
method_option :explain, :default => false, :type => :boolean
|
28
|
+
method_option :grep, :default => nil, :type => :string
|
29
|
+
method_option :limit, :default => 1.0/0, :type => :numeric
|
30
30
|
def match(a_url, b_url)
|
31
31
|
puts "Checking matches using fuzzy_match version #{FuzzyMatch::VERSION}..."
|
32
32
|
fz = mkfz a_url
|
@@ -84,12 +84,12 @@ class FuzzyMatch
|
|
84
84
|
def fz_options
|
85
85
|
memo = {}
|
86
86
|
if options.groupings
|
87
|
-
memo[:groupings] = RemoteTable.new(options.groupings, headers
|
87
|
+
memo[:groupings] = RemoteTable.new(options.groupings, :headers => false).map do |row|
|
88
88
|
row.to_a.select(&:present?).map { |v| v.to_regexp(detect: true) }
|
89
89
|
end
|
90
90
|
end
|
91
91
|
if options.rules
|
92
|
-
t = RemoteTable.new(options.rules, headers
|
92
|
+
t = RemoteTable.new(options.rules, :headers => :first_row)
|
93
93
|
find_options = t.rows.map { |row| row['find_options'] }
|
94
94
|
memo.merge!(
|
95
95
|
identities: t.rows.map { |row| row['identities'] }.select(&:present?).map { |v| v.to_regexp(detect: true) },
|
data/lib/fuzzy_match.rb
CHANGED
data/lib/fuzzy_match/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fuzzy_match
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.3
|
4
|
+
version: 2.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-09-19 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: active_record_inline_schema
|