perseus_match 0.0.1 → 0.0.2

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
data/README CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  == VERSION
4
4
 
5
- This documentation refers to perseus_match version 0.0.1
5
+ This documentation refers to perseus_match version 0.0.2
6
6
 
7
7
 
8
8
  == DESCRIPTION
@@ -15,6 +15,12 @@ Fuzzy string matching based on linguistic analysis.
15
15
  * Jens Wille <mailto:jens.wille@uni-koeln.de>
16
16
 
17
17
 
18
+ == LINKS
19
+
20
+ * <http://prometheus.rubyforge.org/perseus_match>
21
+ * <http://github.com/blackwinter/perseus_match>
22
+
23
+
18
24
  == LICENSE AND COPYRIGHT
19
25
 
20
26
  Copyright (C) 2008 Cologne University of Applied Sciences,
data/Rakefile CHANGED
@@ -14,7 +14,7 @@ begin
14
14
  :summary => %q{Fuzzy string matching based on linguistic analysis},
15
15
  :files => FileList['lib/**/*.rb', 'bin/*'].to_a,
16
16
  :extra_files => FileList['[A-Z]*'].to_a,
17
- :dependencies => %w[]
17
+ :dependencies => [['ruby-nuggets', '>= 0.3.0']]
18
18
  }
19
19
  }}
20
20
  rescue LoadError
data/bin/perseus_match CHANGED
@@ -1,6 +1,10 @@
1
1
  #! /usr/bin/ruby
2
2
 
3
3
  require 'optparse'
4
+ require 'benchmark'
5
+
6
+ require 'rubygems'
7
+ require 'nuggets/numeric/duration'
4
8
 
5
9
  $: << File.join(File.dirname(__FILE__), '..', 'lib')
6
10
 
@@ -10,8 +14,9 @@ USAGE = "Usage: #{$0} [-h|--help] [options] <file>"
10
14
  abort USAGE if ARGV.empty?
11
15
 
12
16
  options = {
13
- :sort => false,
14
- :threshold => 0
17
+ :stats => false,
18
+ :threshold => 0,
19
+ :sort => false
15
20
  }
16
21
 
17
22
  OptionParser.new { |opts|
@@ -20,6 +25,10 @@ OptionParser.new { |opts|
20
25
  opts.separator ''
21
26
  opts.separator 'Options:'
22
27
 
28
+ opts.on('--stats', 'Output some statistics at the end') {
29
+ options[:stats] = true
30
+ }
31
+
23
32
  opts.on('-t', '--threshold NUM', Float, "Similarity threshold [Default: #{options[:threshold]}]") { |t|
24
33
  options[:threshold] = t
25
34
  }
@@ -49,16 +58,37 @@ end
49
58
  PerseusMatch::TokenSet.tokenize(file)
50
59
 
51
60
  phrases = File.readlines(file).map { |line| line.chomp }
52
- threshold = options[:threshold]
53
61
 
54
- if options[:sort]
55
- require 'pp'
62
+ threshold, count, count_all = options[:threshold], 0, 0
63
+
64
+ time = Benchmark.realtime {
65
+ if options[:sort]
66
+ require 'pp'
67
+
68
+ pp PerseusMatch::Cluster.new(phrases).sort { |pm|
69
+ if pm.similarity >= threshold
70
+ [pm.target, pm.distance, pm.similarity]
71
+ count += 1
72
+ end
73
+ count_all += 1
74
+ }.compact
75
+ else
76
+ PerseusMatch::List.pair(phrases) { |pm|
77
+ if pm.similarity >= threshold
78
+ p [pm.phrase, pm.target, pm.distance, pm.similarity]
79
+ count += 1
80
+ end
81
+ count_all += 1
82
+ }
83
+ end
84
+ }
85
+
86
+ if options[:stats]
87
+ hms, x, y = time.to_hms(2), time / count, time / count_all
56
88
 
57
- pp PerseusMatch::Cluster.new(phrases).sort { |m|
58
- [m.target, m.distance, m.similarity] if m.similarity >= threshold
59
- }.compact
60
- else
61
- PerseusMatch::List.pair(phrases) { |pm|
62
- p [pm.phrase, pm.target, pm.distance, pm.similarity] if pm.similarity >= threshold
63
- }
89
+ precision = lambda { |i| i.to_s.sub(/\./, '')[/\A0*/].length + 2 }
90
+
91
+ warn "%d (%d/%d): %s => %0.#{precision[x]}fs/%0.#{precision[y]}fs" % [
92
+ phrases.size, count, count_all, hms, x, y
93
+ ]
64
94
  end
@@ -1,3 +1,31 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of perseus_match, the fuzzy string matcher #
5
+ # #
6
+ # Copyright (C) 2008 Cologne University of Applied Sciences #
7
+ # Claudiusstr. 1 #
8
+ # 50678 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # perseus_match is free software: you can redistribute it and/or modify it #
14
+ # under the terms of the GNU General Public License as published by the Free #
15
+ # Software Foundation, either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
+ # #
18
+ # perseus_match is distributed in the hope that it will be useful, but #
19
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY #
20
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License #
21
+ # for more details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with perseus_match. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
1
29
  class PerseusMatch
2
30
 
3
31
  class Cluster < Hash
@@ -17,7 +45,7 @@ class PerseusMatch
17
45
  def sort_by(attribute, *args, &block)
18
46
  options = args.last.is_a?(Hash) ? args.pop : {}
19
47
 
20
- map { |phrase, matches|
48
+ _ = map { |phrase, matches|
21
49
  res = {}
22
50
 
23
51
  matches = matches.sort_by { |match|
@@ -47,6 +75,8 @@ class PerseusMatch
47
75
 
48
76
  [phrase, matches]
49
77
  }.sort
78
+
79
+ _ # rcov hack :-(
50
80
  end
51
81
 
52
82
  def sort(options = {}, &block)
@@ -1,3 +1,31 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of perseus_match, the fuzzy string matcher #
5
+ # #
6
+ # Copyright (C) 2008 Cologne University of Applied Sciences #
7
+ # Claudiusstr. 1 #
8
+ # 50678 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # perseus_match is free software: you can redistribute it and/or modify it #
14
+ # under the terms of the GNU General Public License as published by the Free #
15
+ # Software Foundation, either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
+ # #
18
+ # perseus_match is distributed in the hope that it will be useful, but #
19
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY #
20
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License #
21
+ # for more details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with perseus_match. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
1
29
  class PerseusMatch
2
30
 
3
31
  class List < Array
@@ -5,17 +33,13 @@ class PerseusMatch
5
33
  class << self
6
34
 
7
35
  def pair(phrases)
8
- if phrases.is_a?(self)
9
- phrases.each { |pm| yield pm }
10
- else
11
- phrases.uniq!
12
-
13
- phrases.each { |phrase|
14
- phrases.each { |target|
15
- yield PerseusMatch.new(phrase, target)
16
- }
36
+ phrases.uniq!
37
+
38
+ phrases.each { |phrase|
39
+ phrases.each { |target|
40
+ yield PerseusMatch.new(phrase, target)
17
41
  }
18
- end
42
+ }
19
43
  end
20
44
 
21
45
  end
@@ -1,3 +1,31 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of perseus_match, the fuzzy string matcher #
5
+ # #
6
+ # Copyright (C) 2008 Cologne University of Applied Sciences #
7
+ # Claudiusstr. 1 #
8
+ # 50678 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # perseus_match is free software: you can redistribute it and/or modify it #
14
+ # under the terms of the GNU General Public License as published by the Free #
15
+ # Software Foundation, either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
+ # #
18
+ # perseus_match is distributed in the hope that it will be useful, but #
19
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY #
20
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License #
21
+ # for more details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with perseus_match. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
1
29
  $KCODE = 'u'
2
30
 
3
31
  LINGO_BASE = '/home/jw/devel/lingo/trunk'
@@ -4,7 +4,7 @@ class PerseusMatch
4
4
 
5
5
  MAJOR = 0
6
6
  MINOR = 0
7
- TINY = 1
7
+ TINY = 2
8
8
 
9
9
  class << self
10
10
 
data/lib/perseus_match.rb CHANGED
@@ -1,3 +1,31 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # perseus_match -- Fuzzy string matching based on linguistic analysis #
5
+ # #
6
+ # Copyright (C) 2008 Cologne University of Applied Sciences #
7
+ # Claudiusstr. 1 #
8
+ # 50678 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # perseus_match is free software: you can redistribute it and/or modify it #
14
+ # under the terms of the GNU General Public License as published by the Free #
15
+ # Software Foundation, either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
+ # #
18
+ # perseus_match is distributed in the hope that it will be useful, but #
19
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY #
20
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License #
21
+ # for more details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with perseus_match. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
1
29
  require 'perseus_match/list'
2
30
  require 'perseus_match/cluster'
3
31
  require 'perseus_match/token_set'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: perseus_match
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jens Wille
@@ -9,10 +9,19 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-08-13 00:00:00 +02:00
12
+ date: 2008-08-15 00:00:00 +02:00
13
13
  default_executable:
14
- dependencies: []
15
-
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: ruby-nuggets
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 0.3.0
24
+ version:
16
25
  description: Fuzzy string matching based on linguistic analysis
17
26
  email: jens.wille@uni-koeln.de
18
27
  executables: