perseus_match 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/README CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  == VERSION
4
4
 
5
- This documentation refers to perseus_match version 0.0.1
5
+ This documentation refers to perseus_match version 0.0.2
6
6
 
7
7
 
8
8
  == DESCRIPTION
@@ -15,6 +15,12 @@ Fuzzy string matching based on linguistic analysis.
15
15
  * Jens Wille <mailto:jens.wille@uni-koeln.de>
16
16
 
17
17
 
18
+ == LINKS
19
+
20
+ * <http://prometheus.rubyforge.org/perseus_match>
21
+ * <http://github.com/blackwinter/perseus_match>
22
+
23
+
18
24
  == LICENSE AND COPYRIGHT
19
25
 
20
26
  Copyright (C) 2008 Cologne University of Applied Sciences,
data/Rakefile CHANGED
@@ -14,7 +14,7 @@ begin
14
14
  :summary => %q{Fuzzy string matching based on linguistic analysis},
15
15
  :files => FileList['lib/**/*.rb', 'bin/*'].to_a,
16
16
  :extra_files => FileList['[A-Z]*'].to_a,
17
- :dependencies => %w[]
17
+ :dependencies => [['ruby-nuggets', '>= 0.3.0']]
18
18
  }
19
19
  }}
20
20
  rescue LoadError
data/bin/perseus_match CHANGED
@@ -1,6 +1,10 @@
1
1
  #! /usr/bin/ruby
2
2
 
3
3
  require 'optparse'
4
+ require 'benchmark'
5
+
6
+ require 'rubygems'
7
+ require 'nuggets/numeric/duration'
4
8
 
5
9
  $: << File.join(File.dirname(__FILE__), '..', 'lib')
6
10
 
@@ -10,8 +14,9 @@ USAGE = "Usage: #{$0} [-h|--help] [options] <file>"
10
14
  abort USAGE if ARGV.empty?
11
15
 
12
16
  options = {
13
- :sort => false,
14
- :threshold => 0
17
+ :stats => false,
18
+ :threshold => 0,
19
+ :sort => false
15
20
  }
16
21
 
17
22
  OptionParser.new { |opts|
@@ -20,6 +25,10 @@ OptionParser.new { |opts|
20
25
  opts.separator ''
21
26
  opts.separator 'Options:'
22
27
 
28
+ opts.on('--stats', 'Output some statistics at the end') {
29
+ options[:stats] = true
30
+ }
31
+
23
32
  opts.on('-t', '--threshold NUM', Float, "Similarity threshold [Default: #{options[:threshold]}]") { |t|
24
33
  options[:threshold] = t
25
34
  }
@@ -49,16 +58,37 @@ end
49
58
  PerseusMatch::TokenSet.tokenize(file)
50
59
 
51
60
  phrases = File.readlines(file).map { |line| line.chomp }
52
- threshold = options[:threshold]
53
61
 
54
- if options[:sort]
55
- require 'pp'
62
+ threshold, count, count_all = options[:threshold], 0, 0
63
+
64
+ time = Benchmark.realtime {
65
+ if options[:sort]
66
+ require 'pp'
67
+
68
+ pp PerseusMatch::Cluster.new(phrases).sort { |pm|
69
+ if pm.similarity >= threshold
70
+ [pm.target, pm.distance, pm.similarity]
71
+ count += 1
72
+ end
73
+ count_all += 1
74
+ }.compact
75
+ else
76
+ PerseusMatch::List.pair(phrases) { |pm|
77
+ if pm.similarity >= threshold
78
+ p [pm.phrase, pm.target, pm.distance, pm.similarity]
79
+ count += 1
80
+ end
81
+ count_all += 1
82
+ }
83
+ end
84
+ }
85
+
86
+ if options[:stats]
87
+ hms, x, y = time.to_hms(2), time / count, time / count_all
56
88
 
57
- pp PerseusMatch::Cluster.new(phrases).sort { |m|
58
- [m.target, m.distance, m.similarity] if m.similarity >= threshold
59
- }.compact
60
- else
61
- PerseusMatch::List.pair(phrases) { |pm|
62
- p [pm.phrase, pm.target, pm.distance, pm.similarity] if pm.similarity >= threshold
63
- }
89
+ precision = lambda { |i| i.to_s.sub(/\./, '')[/\A0*/].length + 2 }
90
+
91
+ warn "%d (%d/%d): %s => %0.#{precision[x]}fs/%0.#{precision[y]}fs" % [
92
+ phrases.size, count, count_all, hms, x, y
93
+ ]
64
94
  end
@@ -1,3 +1,31 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of perseus_match, the fuzzy string matcher #
5
+ # #
6
+ # Copyright (C) 2008 Cologne University of Applied Sciences #
7
+ # Claudiusstr. 1 #
8
+ # 50678 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # perseus_match is free software: you can redistribute it and/or modify it #
14
+ # under the terms of the GNU General Public License as published by the Free #
15
+ # Software Foundation, either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
+ # #
18
+ # perseus_match is distributed in the hope that it will be useful, but #
19
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY #
20
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License #
21
+ # for more details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with perseus_match. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
1
29
  class PerseusMatch
2
30
 
3
31
  class Cluster < Hash
@@ -17,7 +45,7 @@ class PerseusMatch
17
45
  def sort_by(attribute, *args, &block)
18
46
  options = args.last.is_a?(Hash) ? args.pop : {}
19
47
 
20
- map { |phrase, matches|
48
+ _ = map { |phrase, matches|
21
49
  res = {}
22
50
 
23
51
  matches = matches.sort_by { |match|
@@ -47,6 +75,8 @@ class PerseusMatch
47
75
 
48
76
  [phrase, matches]
49
77
  }.sort
78
+
79
+ _ # rcov hack :-(
50
80
  end
51
81
 
52
82
  def sort(options = {}, &block)
@@ -1,3 +1,31 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of perseus_match, the fuzzy string matcher #
5
+ # #
6
+ # Copyright (C) 2008 Cologne University of Applied Sciences #
7
+ # Claudiusstr. 1 #
8
+ # 50678 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # perseus_match is free software: you can redistribute it and/or modify it #
14
+ # under the terms of the GNU General Public License as published by the Free #
15
+ # Software Foundation, either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
+ # #
18
+ # perseus_match is distributed in the hope that it will be useful, but #
19
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY #
20
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License #
21
+ # for more details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with perseus_match. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
1
29
  class PerseusMatch
2
30
 
3
31
  class List < Array
@@ -5,17 +33,13 @@ class PerseusMatch
5
33
  class << self
6
34
 
7
35
  def pair(phrases)
8
- if phrases.is_a?(self)
9
- phrases.each { |pm| yield pm }
10
- else
11
- phrases.uniq!
12
-
13
- phrases.each { |phrase|
14
- phrases.each { |target|
15
- yield PerseusMatch.new(phrase, target)
16
- }
36
+ phrases.uniq!
37
+
38
+ phrases.each { |phrase|
39
+ phrases.each { |target|
40
+ yield PerseusMatch.new(phrase, target)
17
41
  }
18
- end
42
+ }
19
43
  end
20
44
 
21
45
  end
@@ -1,3 +1,31 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of perseus_match, the fuzzy string matcher #
5
+ # #
6
+ # Copyright (C) 2008 Cologne University of Applied Sciences #
7
+ # Claudiusstr. 1 #
8
+ # 50678 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # perseus_match is free software: you can redistribute it and/or modify it #
14
+ # under the terms of the GNU General Public License as published by the Free #
15
+ # Software Foundation, either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
+ # #
18
+ # perseus_match is distributed in the hope that it will be useful, but #
19
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY #
20
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License #
21
+ # for more details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with perseus_match. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
1
29
  $KCODE = 'u'
2
30
 
3
31
  LINGO_BASE = '/home/jw/devel/lingo/trunk'
@@ -4,7 +4,7 @@ class PerseusMatch
4
4
 
5
5
  MAJOR = 0
6
6
  MINOR = 0
7
- TINY = 1
7
+ TINY = 2
8
8
 
9
9
  class << self
10
10
 
data/lib/perseus_match.rb CHANGED
@@ -1,3 +1,31 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # perseus_match -- Fuzzy string matching based on linguistic analysis #
5
+ # #
6
+ # Copyright (C) 2008 Cologne University of Applied Sciences #
7
+ # Claudiusstr. 1 #
8
+ # 50678 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # perseus_match is free software: you can redistribute it and/or modify it #
14
+ # under the terms of the GNU General Public License as published by the Free #
15
+ # Software Foundation, either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
+ # #
18
+ # perseus_match is distributed in the hope that it will be useful, but #
19
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY #
20
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License #
21
+ # for more details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with perseus_match. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
1
29
  require 'perseus_match/list'
2
30
  require 'perseus_match/cluster'
3
31
  require 'perseus_match/token_set'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: perseus_match
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jens Wille
@@ -9,10 +9,19 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-08-13 00:00:00 +02:00
12
+ date: 2008-08-15 00:00:00 +02:00
13
13
  default_executable:
14
- dependencies: []
15
-
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: ruby-nuggets
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 0.3.0
24
+ version:
16
25
  description: Fuzzy string matching based on linguistic analysis
17
26
  email: jens.wille@uni-koeln.de
18
27
  executables: