fuzzy_match 2.0.3 → 2.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +6 -0
- data/Gemfile +2 -2
- data/README.markdown +10 -2
- data/bin/fuzzy_match +11 -11
- data/lib/fuzzy_match.rb +1 -1
- data/lib/fuzzy_match/version.rb +1 -1
- metadata +2 -2
data/CHANGELOG
CHANGED
data/Gemfile
CHANGED
data/README.markdown
CHANGED
@@ -1,4 +1,12 @@
|
|
1
|
-
|
1
|
+
## Top 3 reasons you should use FuzzyMatch
|
2
|
+
|
3
|
+
1. *intelligent defaults*: it uses a combination of Pair Distance (2-gram) and Levenshtein Edit Distance to effectively match many examples with no configuration
|
4
|
+
2. *all-vs-all*: it takes care of finding the optimal match by comparing everything against everything else (when that's necessary)
|
5
|
+
3. *refinable*: you might get to 90% with no configuration, but if you need to go beyond you can use regexps, grouping, and stop words
|
6
|
+
|
7
|
+
It solves many mid-range matching problems: if your haystack is ~10k records, or if you can winnow down the initial possibilities at the database level and only bring good contenders into app memory, why not give it a shot?
|
8
|
+
|
9
|
+
# FuzzyMatch
|
2
10
|
|
3
11
|
Find a needle in a haystack based on string similarity and regular expression rules.
|
4
12
|
|
@@ -12,7 +20,7 @@ Warning! `normalizers` are gone in version 2 and above! See the CHANGELOG and ch
|
|
12
20
|
|
13
21
|
>> require 'fuzzy_match'
|
14
22
|
=> true
|
15
|
-
>> FuzzyMatch.new(['seamus', 'andy', 'ben']).find('Shamus)
|
23
|
+
>> FuzzyMatch.new(['seamus', 'andy', 'ben']).find('Shamus')
|
16
24
|
=> "seamus"
|
17
25
|
|
18
26
|
See also the blog post [Fuzzy match in Ruby](http://numbers.brighterplanet.com/2012/01/18/fuzzy-match-in-ruby/).
|
data/bin/fuzzy_match
CHANGED
@@ -18,15 +18,15 @@ require 'to_regexp'
|
|
18
18
|
class FuzzyMatch
|
19
19
|
class Cli < ::Thor
|
20
20
|
desc :match, "Print out matches between A and B, where A is haystack and B is a bunch of needles."
|
21
|
-
method_option :csv, default
|
22
|
-
method_option :a_col, default
|
23
|
-
method_option :b_col, default
|
24
|
-
method_option :downcase, default
|
25
|
-
method_option :groupings, default
|
26
|
-
method_option :rules, default
|
27
|
-
method_option :explain, default
|
28
|
-
method_option :grep, default
|
29
|
-
method_option :limit, default
|
21
|
+
method_option :csv, :default => false, :type => :boolean, :desc => "CSV output"
|
22
|
+
method_option :a_col, :default => 0, :type => :string, :desc => "Column name in A. Defaults to first column."
|
23
|
+
method_option :b_col, :default => 0, :type => :string, :desc => "Column name in B. Defaults to first column."
|
24
|
+
method_option :downcase, :default => true, :type => :boolean, :desc => "Whether to downcase everything (except regexes, where you have to do /foo/i)"
|
25
|
+
method_option :groupings, :default => nil, :type => :string, :desc => "Spreadsheet with groupings - no headers, multi-part groupings on the same row"
|
26
|
+
method_option :rules, :default => nil, :type => :string, :desc => "Spreadsheet with headers: stop_words, identities, find_options. Listing a find_option like must_match_grouping makes it true."
|
27
|
+
method_option :explain, :default => false, :type => :boolean
|
28
|
+
method_option :grep, :default => nil, :type => :string
|
29
|
+
method_option :limit, :default => 1.0/0, :type => :numeric
|
30
30
|
def match(a_url, b_url)
|
31
31
|
puts "Checking matches using fuzzy_match version #{FuzzyMatch::VERSION}..."
|
32
32
|
fz = mkfz a_url
|
@@ -84,12 +84,12 @@ class FuzzyMatch
|
|
84
84
|
def fz_options
|
85
85
|
memo = {}
|
86
86
|
if options.groupings
|
87
|
-
memo[:groupings] = RemoteTable.new(options.groupings, headers
|
87
|
+
memo[:groupings] = RemoteTable.new(options.groupings, :headers => false).map do |row|
|
88
88
|
row.to_a.select(&:present?).map { |v| v.to_regexp(detect: true) }
|
89
89
|
end
|
90
90
|
end
|
91
91
|
if options.rules
|
92
|
-
t = RemoteTable.new(options.rules, headers
|
92
|
+
t = RemoteTable.new(options.rules, :headers => :first_row)
|
93
93
|
find_options = t.rows.map { |row| row['find_options'] }
|
94
94
|
memo.merge!(
|
95
95
|
identities: t.rows.map { |row| row['identities'] }.select(&:present?).map { |v| v.to_regexp(detect: true) },
|
data/lib/fuzzy_match.rb
CHANGED
data/lib/fuzzy_match/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fuzzy_match
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.3
|
4
|
+
version: 2.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-09-19 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: active_record_inline_schema
|