perseus_match 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +7 -1
- data/Rakefile +1 -1
- data/bin/perseus_match +42 -12
- data/lib/perseus_match/cluster.rb +31 -1
- data/lib/perseus_match/list.rb +34 -10
- data/lib/perseus_match/token_set.rb +28 -0
- data/lib/perseus_match/version.rb +1 -1
- data/lib/perseus_match.rb +28 -0
- metadata +13 -4
data/README
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
== VERSION
|
4
4
|
|
5
|
-
This documentation refers to perseus_match version 0.0.1
|
5
|
+
This documentation refers to perseus_match version 0.0.2
|
6
6
|
|
7
7
|
|
8
8
|
== DESCRIPTION
|
@@ -15,6 +15,12 @@ Fuzzy string matching based on linguistic analysis.
|
|
15
15
|
* Jens Wille <mailto:jens.wille@uni-koeln.de>
|
16
16
|
|
17
17
|
|
18
|
+
== LINKS
|
19
|
+
|
20
|
+
* <http://prometheus.rubyforge.org/perseus_match>
|
21
|
+
* <http://github.com/blackwinter/perseus_match>
|
22
|
+
|
23
|
+
|
18
24
|
== LICENSE AND COPYRIGHT
|
19
25
|
|
20
26
|
Copyright (C) 2008 Cologne University of Applied Sciences,
|
data/Rakefile
CHANGED
@@ -14,7 +14,7 @@ begin
|
|
14
14
|
:summary => %q{Fuzzy string matching based on linguistic analysis},
|
15
15
|
:files => FileList['lib/**/*.rb', 'bin/*'].to_a,
|
16
16
|
:extra_files => FileList['[A-Z]*'].to_a,
|
17
|
-
:dependencies =>
|
17
|
+
:dependencies => [['ruby-nuggets', '>= 0.3.0']]
|
18
18
|
}
|
19
19
|
}}
|
20
20
|
rescue LoadError
|
data/bin/perseus_match
CHANGED
@@ -1,6 +1,10 @@
|
|
1
1
|
#! /usr/bin/ruby
|
2
2
|
|
3
3
|
require 'optparse'
|
4
|
+
require 'benchmark'
|
5
|
+
|
6
|
+
require 'rubygems'
|
7
|
+
require 'nuggets/numeric/duration'
|
4
8
|
|
5
9
|
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
6
10
|
|
@@ -10,8 +14,9 @@ USAGE = "Usage: #{$0} [-h|--help] [options] <file>"
|
|
10
14
|
abort USAGE if ARGV.empty?
|
11
15
|
|
12
16
|
options = {
|
13
|
-
:
|
14
|
-
:threshold => 0
|
17
|
+
:stats => false,
|
18
|
+
:threshold => 0,
|
19
|
+
:sort => false
|
15
20
|
}
|
16
21
|
|
17
22
|
OptionParser.new { |opts|
|
@@ -20,6 +25,10 @@ OptionParser.new { |opts|
|
|
20
25
|
opts.separator ''
|
21
26
|
opts.separator 'Options:'
|
22
27
|
|
28
|
+
opts.on('--stats', 'Output some statistics at the end') {
|
29
|
+
options[:stats] = true
|
30
|
+
}
|
31
|
+
|
23
32
|
opts.on('-t', '--threshold NUM', Float, "Similarity threshold [Default: #{options[:threshold]}]") { |t|
|
24
33
|
options[:threshold] = t
|
25
34
|
}
|
@@ -49,16 +58,37 @@ end
|
|
49
58
|
PerseusMatch::TokenSet.tokenize(file)
|
50
59
|
|
51
60
|
phrases = File.readlines(file).map { |line| line.chomp }
|
52
|
-
threshold = options[:threshold]
|
53
61
|
|
54
|
-
|
55
|
-
|
62
|
+
threshold, count, count_all = options[:threshold], 0, 0
|
63
|
+
|
64
|
+
time = Benchmark.realtime {
|
65
|
+
if options[:sort]
|
66
|
+
require 'pp'
|
67
|
+
|
68
|
+
pp PerseusMatch::Cluster.new(phrases).sort { |pm|
|
69
|
+
if pm.similarity >= threshold
|
70
|
+
[pm.target, pm.distance, pm.similarity]
|
71
|
+
count += 1
|
72
|
+
end
|
73
|
+
count_all += 1
|
74
|
+
}.compact
|
75
|
+
else
|
76
|
+
PerseusMatch::List.pair(phrases) { |pm|
|
77
|
+
if pm.similarity >= threshold
|
78
|
+
p [pm.phrase, pm.target, pm.distance, pm.similarity]
|
79
|
+
count += 1
|
80
|
+
end
|
81
|
+
count_all += 1
|
82
|
+
}
|
83
|
+
end
|
84
|
+
}
|
85
|
+
|
86
|
+
if options[:stats]
|
87
|
+
hms, x, y = time.to_hms(2), time / count, time / count_all
|
56
88
|
|
57
|
-
|
58
|
-
|
59
|
-
}
|
60
|
-
|
61
|
-
|
62
|
-
p [pm.phrase, pm.target, pm.distance, pm.similarity] if pm.similarity >= threshold
|
63
|
-
}
|
89
|
+
precision = lambda { |i| i.to_s.sub(/\./, '')[/\A0*/].length + 2 }
|
90
|
+
|
91
|
+
warn "%d (%d/%d): %s => %0.#{precision[x]}fs/%0.#{precision[y]}fs" % [
|
92
|
+
phrases.size, count, count_all, hms, x, y
|
93
|
+
]
|
64
94
|
end
|
data/lib/perseus_match/cluster.rb
CHANGED
@@ -1,3 +1,31 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of perseus_match, the fuzzy string matcher #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2008 Cologne University of Applied Sciences #
|
7
|
+
# Claudiusstr. 1 #
|
8
|
+
# 50678 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# perseus_match is free software: you can redistribute it and/or modify it #
|
14
|
+
# under the terms of the GNU General Public License as published by the Free #
|
15
|
+
# Software Foundation, either version 3 of the License, or (at your option) #
|
16
|
+
# any later version. #
|
17
|
+
# #
|
18
|
+
# perseus_match is distributed in the hope that it will be useful, but #
|
19
|
+
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY #
|
20
|
+
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License #
|
21
|
+
# for more details. #
|
22
|
+
# #
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
24
|
+
# with perseus_match. If not, see <http://www.gnu.org/licenses/>. #
|
25
|
+
# #
|
26
|
+
###############################################################################
|
27
|
+
#++
|
28
|
+
|
1
29
|
class PerseusMatch
|
2
30
|
|
3
31
|
class Cluster < Hash
|
@@ -17,7 +45,7 @@ class PerseusMatch
|
|
17
45
|
def sort_by(attribute, *args, &block)
|
18
46
|
options = args.last.is_a?(Hash) ? args.pop : {}
|
19
47
|
|
20
|
-
map { |phrase, matches|
|
48
|
+
_ = map { |phrase, matches|
|
21
49
|
res = {}
|
22
50
|
|
23
51
|
matches = matches.sort_by { |match|
|
@@ -47,6 +75,8 @@ class PerseusMatch
|
|
47
75
|
|
48
76
|
[phrase, matches]
|
49
77
|
}.sort
|
78
|
+
|
79
|
+
_ # rcov hack :-(
|
50
80
|
end
|
51
81
|
|
52
82
|
def sort(options = {}, &block)
|
data/lib/perseus_match/list.rb
CHANGED
@@ -1,3 +1,31 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of perseus_match, the fuzzy string matcher #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2008 Cologne University of Applied Sciences #
|
7
|
+
# Claudiusstr. 1 #
|
8
|
+
# 50678 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# perseus_match is free software: you can redistribute it and/or modify it #
|
14
|
+
# under the terms of the GNU General Public License as published by the Free #
|
15
|
+
# Software Foundation, either version 3 of the License, or (at your option) #
|
16
|
+
# any later version. #
|
17
|
+
# #
|
18
|
+
# perseus_match is distributed in the hope that it will be useful, but #
|
19
|
+
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY #
|
20
|
+
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License #
|
21
|
+
# for more details. #
|
22
|
+
# #
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
24
|
+
# with perseus_match. If not, see <http://www.gnu.org/licenses/>. #
|
25
|
+
# #
|
26
|
+
###############################################################################
|
27
|
+
#++
|
28
|
+
|
1
29
|
class PerseusMatch
|
2
30
|
|
3
31
|
class List < Array
|
@@ -5,17 +33,13 @@ class PerseusMatch
|
|
5
33
|
class << self
|
6
34
|
|
7
35
|
def pair(phrases)
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
phrases.
|
12
|
-
|
13
|
-
phrases.each { |phrase|
|
14
|
-
phrases.each { |target|
|
15
|
-
yield PerseusMatch.new(phrase, target)
|
16
|
-
}
|
36
|
+
phrases.uniq!
|
37
|
+
|
38
|
+
phrases.each { |phrase|
|
39
|
+
phrases.each { |target|
|
40
|
+
yield PerseusMatch.new(phrase, target)
|
17
41
|
}
|
18
|
-
|
42
|
+
}
|
19
43
|
end
|
20
44
|
|
21
45
|
end
|
data/lib/perseus_match/token_set.rb
CHANGED
@@ -1,3 +1,31 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of perseus_match, the fuzzy string matcher #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2008 Cologne University of Applied Sciences #
|
7
|
+
# Claudiusstr. 1 #
|
8
|
+
# 50678 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# perseus_match is free software: you can redistribute it and/or modify it #
|
14
|
+
# under the terms of the GNU General Public License as published by the Free #
|
15
|
+
# Software Foundation, either version 3 of the License, or (at your option) #
|
16
|
+
# any later version. #
|
17
|
+
# #
|
18
|
+
# perseus_match is distributed in the hope that it will be useful, but #
|
19
|
+
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY #
|
20
|
+
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License #
|
21
|
+
# for more details. #
|
22
|
+
# #
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
24
|
+
# with perseus_match. If not, see <http://www.gnu.org/licenses/>. #
|
25
|
+
# #
|
26
|
+
###############################################################################
|
27
|
+
#++
|
28
|
+
|
1
29
|
$KCODE = 'u'
|
2
30
|
|
3
31
|
LINGO_BASE = '/home/jw/devel/lingo/trunk'
|
data/lib/perseus_match.rb
CHANGED
@@ -1,3 +1,31 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# perseus_match -- Fuzzy string matching based on linguistic analysis #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2008 Cologne University of Applied Sciences #
|
7
|
+
# Claudiusstr. 1 #
|
8
|
+
# 50678 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# perseus_match is free software: you can redistribute it and/or modify it #
|
14
|
+
# under the terms of the GNU General Public License as published by the Free #
|
15
|
+
# Software Foundation, either version 3 of the License, or (at your option) #
|
16
|
+
# any later version. #
|
17
|
+
# #
|
18
|
+
# perseus_match is distributed in the hope that it will be useful, but #
|
19
|
+
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY #
|
20
|
+
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License #
|
21
|
+
# for more details. #
|
22
|
+
# #
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
24
|
+
# with perseus_match. If not, see <http://www.gnu.org/licenses/>. #
|
25
|
+
# #
|
26
|
+
###############################################################################
|
27
|
+
#++
|
28
|
+
|
1
29
|
require 'perseus_match/list'
|
2
30
|
require 'perseus_match/cluster'
|
3
31
|
require 'perseus_match/token_set'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: perseus_match
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jens Wille
|
@@ -9,10 +9,19 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-08-
|
12
|
+
date: 2008-08-15 00:00:00 +02:00
|
13
13
|
default_executable:
|
14
|
-
dependencies:
|
15
|
-
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: ruby-nuggets
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.3.0
|
24
|
+
version:
|
16
25
|
description: Fuzzy string matching based on linguistic analysis
|
17
26
|
email: jens.wille@uni-koeln.de
|
18
27
|
executables:
|