perseus_match 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/README +1 -1
- data/bin/perseus_match +43 -9
- data/lib/perseus_match/token_set.rb +12 -6
- data/lib/perseus_match/version.rb +1 -1
- metadata +2 -2
data/README
CHANGED
data/bin/perseus_match
CHANGED
@@ -8,6 +8,7 @@ require 'set'
|
|
8
8
|
require 'rubygems'
|
9
9
|
require 'nuggets/enumerable/minmax'
|
10
10
|
require 'nuggets/numeric/duration'
|
11
|
+
require 'nuggets/string/evaluate'
|
11
12
|
|
12
13
|
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
13
14
|
|
@@ -26,6 +27,7 @@ options = {
|
|
26
27
|
:minimal => false,
|
27
28
|
:separate => false,
|
28
29
|
:lingo => false,
|
30
|
+
:format => nil,
|
29
31
|
:check => false,
|
30
32
|
:failed_only => false,
|
31
33
|
:align => false,
|
@@ -84,6 +86,10 @@ OptionParser.new { |opts|
|
|
84
86
|
options[:lingo] = true
|
85
87
|
}
|
86
88
|
|
89
|
+
opts.on('-F', '--format FORMAT', 'Custom output format. Available placeholders:', ' %p = phrase', ' %P = phrase, CSV-ready', ' %t = target', ' %T = target, CSV-ready', ' %d = distance', ' %s = similarity') { |f|
|
90
|
+
options[:format] = f
|
91
|
+
}
|
92
|
+
|
87
93
|
opts.separator ' '
|
88
94
|
opts.separator ' * Checking pairs'
|
89
95
|
opts.separator ' '
|
@@ -265,10 +271,37 @@ action = if options[:check]
|
|
265
271
|
_action
|
266
272
|
end
|
267
273
|
else
|
268
|
-
format =
|
269
|
-
|
270
|
-
|
271
|
-
|
274
|
+
format = if _format = options[:format]
|
275
|
+
substitutions = {
|
276
|
+
'p' => ['#{pm.phrase}', 's'],
|
277
|
+
'P' => ['"#{pm.phrase.gsub(/"/, %q{""})}"', 's'],
|
278
|
+
't' => ['#{pm.target}', 's'],
|
279
|
+
'T' => ['"#{pm.target.gsub(/"/, %q{""})}"', 's'],
|
280
|
+
'd' => ['#{pm.distance}', 'd'],
|
281
|
+
's' => ['#{pm.similarity}', 'f']
|
282
|
+
}
|
283
|
+
|
284
|
+
lambda { |pm|
|
285
|
+
_format.gsub(/(%-?[.\d]*)([pPtTds])/) {
|
286
|
+
value, field = substitutions[$2]
|
287
|
+
"#{$1}#{field}" % value.evaluate(binding)
|
288
|
+
}
|
289
|
+
}
|
290
|
+
else
|
291
|
+
if options[:lingo]
|
292
|
+
if options[:minimal]
|
293
|
+
lambda { |pm| ["#{pm.phrase}*#{pm.target}", "#{pm.target}*#{pm.phrase}"] }
|
294
|
+
else
|
295
|
+
lambda { |pm| "#{pm.phrase}*#{pm.target}" }
|
296
|
+
end
|
297
|
+
else
|
298
|
+
if options[:sort]
|
299
|
+
lambda { |pm| " #{[pm.target, pm.distance, pm.similarity].inspect}" }
|
300
|
+
else
|
301
|
+
lambda { |pm| [pm.phrase, pm.target, pm.distance, pm.similarity].inspect }
|
302
|
+
end
|
303
|
+
end
|
304
|
+
end
|
272
305
|
|
273
306
|
if options[:sort]
|
274
307
|
lambda {
|
@@ -290,13 +323,14 @@ else
|
|
290
323
|
PerseusMatch::List.pair(phrases, pm_options, list_options) { |pm|
|
291
324
|
count_all += 1
|
292
325
|
|
293
|
-
if separator && pm.phrase != previous_phrase ||= pm.phrase
|
294
|
-
puts separator
|
295
|
-
previous_phrase = pm.phrase
|
296
|
-
end
|
297
|
-
|
298
326
|
if pm.similarity >= threshold
|
299
327
|
count += 1
|
328
|
+
|
329
|
+
if separator && pm.phrase != previous_phrase ||= pm.phrase
|
330
|
+
puts separator
|
331
|
+
previous_phrase = pm.phrase
|
332
|
+
end
|
333
|
+
|
300
334
|
puts format[pm]
|
301
335
|
end
|
302
336
|
}
|
@@ -81,9 +81,13 @@ class PerseusMatch
|
|
81
81
|
def self.tokenize(form, unknowns = false)
|
82
82
|
return @tokens[form] if @tokens
|
83
83
|
|
84
|
-
@_tokens, @tokens = {}, Hash.new { |h, k|
|
85
|
-
|
86
|
-
|
84
|
+
@_tokens, @tokens = {}, Hash.new { |h, k|
|
85
|
+
h[k] = new(
|
86
|
+
k, (@_tokens[k] || []) | (
|
87
|
+
k.scan(/\w+/) + k.scan(/[\w-]+/)
|
88
|
+
).map { |i| @_tokens[i] }.flatten.compact
|
89
|
+
)
|
90
|
+
}
|
87
91
|
|
88
92
|
parse = lambda { |x|
|
89
93
|
x.each_line { |res|
|
@@ -130,10 +134,12 @@ class PerseusMatch
|
|
130
134
|
file = temp.path
|
131
135
|
end
|
132
136
|
|
137
|
+
ruby = Config::CONFIG.values_at('RUBY_INSTALL_NAME', 'EXEEXT').join
|
138
|
+
|
133
139
|
begin
|
134
|
-
Dir.chdir(LINGO_BASE) {
|
135
|
-
#{
|
136
|
-
}
|
140
|
+
Dir.chdir(LINGO_BASE) {
|
141
|
+
parse[%x{#{ruby} lingo.rb -c "#{cfg.path}" < "#{file}"}]
|
142
|
+
}
|
137
143
|
ensure
|
138
144
|
cfg.unlink
|
139
145
|
temp.unlink if temp
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: perseus_match
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jens Wille
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-01-
|
12
|
+
date: 2009-01-26 00:00:00 +01:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|