lingo 1.9.0.pre1 → 1.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/ChangeLog +18 -7
  3. data/README +6 -8
  4. data/Rakefile +5 -5
  5. data/dict/en/lingo-dic.txt +52625 -15693
  6. data/lang/en.lang +2 -2
  7. data/lib/lingo.rb +15 -3
  8. data/lib/lingo/array_utils.rb +39 -0
  9. data/lib/lingo/attendee.rb +1 -3
  10. data/lib/lingo/attendee/multi_worder.rb +4 -2
  11. data/lib/lingo/attendee/sequencer.rb +122 -73
  12. data/lib/lingo/attendee/text_writer.rb +4 -6
  13. data/lib/lingo/attendee/vector_filter.rb +5 -5
  14. data/lib/lingo/cli.rb +20 -2
  15. data/lib/lingo/config.rb +4 -3
  16. data/lib/lingo/ctl.rb +2 -20
  17. data/lib/lingo/ctl/analysis.rb +3 -5
  18. data/lib/lingo/ctl/files.rb +3 -3
  19. data/lib/lingo/database.rb +26 -25
  20. data/lib/lingo/database/crypter.rb +10 -6
  21. data/lib/lingo/database/source.rb +72 -25
  22. data/lib/lingo/database/source/key_value.rb +12 -8
  23. data/lib/lingo/database/source/multi_key.rb +11 -9
  24. data/lib/lingo/database/source/multi_value.rb +10 -8
  25. data/lib/lingo/database/source/single_word.rb +10 -6
  26. data/lib/lingo/database/source/word_class.rb +43 -14
  27. data/lib/lingo/debug.rb +2 -2
  28. data/lib/lingo/error.rb +21 -5
  29. data/lib/lingo/filter.rb +1 -1
  30. data/lib/lingo/language.rb +21 -21
  31. data/lib/lingo/language/grammar.rb +4 -2
  32. data/lib/lingo/language/lexical_hash.rb +2 -14
  33. data/lib/lingo/language/word.rb +1 -5
  34. data/lib/lingo/text_utils.rb +113 -20
  35. data/lib/lingo/version.rb +1 -1
  36. data/test/attendee/ts_sequencer.rb +286 -32
  37. data/test/attendee/ts_text_reader.rb +4 -4
  38. data/test/attendee/ts_text_writer.rb +19 -5
  39. data/test/test_helper.rb +2 -0
  40. data/test/ts_database.rb +213 -14
  41. metadata +36 -24
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -64,7 +64,8 @@ class Lingo
64
64
  opts.separator
65
65
 
66
66
  opts.option(:log__FILE, :L, 'Log file to print debug information to') { |log|
67
- options[:log] = stderr.reopen(log == '-' ? stdout : File.open(log, 'a+', encoding: ENC))
67
+ options[:log] = stderr.reopen(
68
+ log == '-' ? stdout : File.open(log, 'a+', encoding: ENCODING))
68
69
  }
69
70
 
70
71
  opts.separator
@@ -76,4 +77,21 @@ class Lingo
76
77
 
77
78
  end
78
79
 
80
+ def self.CLI(args, extra = nil, &block)
81
+ opt, req = args.partition { |arg| arg.sub!(/\?\z/, '') }
82
+
83
+ unless (n = ARGV.size - req.size) >= 0 && n <= opt.size
84
+ msg = "Usage: #{$0}#{args.map { |arg| [' ', arg].zip(
85
+ opt.include?(arg) ? %w[[ ]] : %w[< >]).join }.join}"
86
+
87
+ abort Array(extra).unshift(msg).join("\n\n")
88
+ end
89
+
90
+ Object.new.extend(TextUtils).instance_eval(&block)
91
+ rescue LingoError => err
92
+ abort err.to_s
93
+ ensure
94
+ ObjectSpace.each_object(Zlib::GzipWriter, &:close)
95
+ end
96
+
79
97
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -53,7 +53,7 @@ class Lingo
53
53
  end
54
54
  end
55
55
 
56
- attr_reader :language_file, :config_file
56
+ attr_reader :lang_file, :config_file
57
57
 
58
58
  def to_h
59
59
  { 'version' => VERSION }.merge(@opts)
@@ -128,7 +128,8 @@ class Lingo
128
128
  file = Lingo.find(type, @opts[key]) { quit }
129
129
  instance_variable_set("@#{type}_file", file)
130
130
 
131
- File.open(file, encoding: ENC) { |f| @opts.update(SafeYAML.load(f)) }
131
+ File.open(file, encoding: ENCODING) { |f|
132
+ @opts.update(SafeYAML.load(f)) }
132
133
  rescue Psych::SyntaxError => err
133
134
  err.message << " (in #{file})"
134
135
  raise
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -31,6 +31,7 @@ class Lingo
31
31
  module Ctl
32
32
 
33
33
  extend self
34
+ extend TextUtils
34
35
 
35
36
  PROG, VERSION, OPTWIDTH = $0, '0.0.3', 21
36
37
  PROGNAME, OPTIONS = File.basename(PROG), {}
@@ -120,25 +121,6 @@ Usage: #{PROG} <command> [arguments] [options]
120
121
  usage('Too many arguments.') unless ARGV.empty?
121
122
  end
122
123
 
123
- def overwrite?(target, unlink = false)
124
- !File.exist?(target) || if agree?("#{target} already exists. Overwrite?")
125
- File.unlink(target) if unlink
126
- true
127
- end
128
- end
129
-
130
- def agree?(msg)
131
- print "#{msg} (y/n) [n]: "
132
-
133
- case answer = $stdin.gets.chomp
134
- when /\Ano?\z/i, '' then nil
135
- when /\Ay(?:es)?\z/i then true
136
- else puts 'Please enter "yes" or "no".'; agree?(msg)
137
- end
138
- rescue Interrupt
139
- abort ''
140
- end
141
-
142
124
  end
143
125
 
144
126
  def self.ctl
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -24,8 +24,6 @@
24
24
  ###############################################################################
25
25
  #++
26
26
 
27
- require 'csv'
28
-
29
27
  class Lingo
30
28
 
31
29
  module Ctl
@@ -97,11 +95,11 @@ class Lingo
97
95
  File.basename(path.chomp(File.extname(path))) }.uniq.join('-'))
98
96
 
99
97
  lambda { |key, &block| overwrite?(file = "#{name}.#{key}.csv") &&
100
- puts("#{file}: #{Array(CSV.open(file, 'wb', &block)).join(' / ')}") }
98
+ puts("#{file}: #{Array(open_csv(file, 'wb', &block)).join(' / ')}") }
101
99
  end
102
100
 
103
101
  def csv_foreach(paths)
104
- paths.each { |path| CSV.foreach(path, headers: true) { |row|
102
+ paths.each { |path| foreach_csv(path, headers: true) { |row|
105
103
  yield path, *row.values_at(*%w[string token word pattern]) } }
106
104
  end
107
105
 
@@ -76,9 +76,9 @@ class Lingo
76
76
  FileUtils.cp(source, target, verbose: true)
77
77
  end
78
78
 
79
- def do_clearstore
80
- store = Dir["#{find(:store, false)}.*"]
81
- FileUtils.rm(store, verbose: true) unless store.empty?
79
+ def clear(what)
80
+ target = Dir["#{find(what, false)}.*"]
81
+ FileUtils.rm(target, verbose: true) unless target.empty?
82
82
  end
83
83
 
84
84
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -45,6 +45,8 @@ class Lingo
45
45
  KEY_REF = '*'
46
46
  SYS_KEY = '~'
47
47
 
48
+ KEY_REF_RE = %r{\A#{Regexp.escape(KEY_REF)}(\d+)\z}o
49
+
48
50
  BACKENDS = []
49
51
  BACKEND_BY_EXT = {}
50
52
 
@@ -58,6 +60,25 @@ class Lingo
58
60
  klass.class_eval('def store_ext; EXT; end', __FILE__, __LINE__) if meth
59
61
  end
60
62
 
63
+ def backend_by_ext(file, ext = File.extname(file))
64
+ get_backend(BACKEND_BY_EXT[ext], file) or
65
+ raise BackendNotFoundError.new(file)
66
+ end
67
+
68
+ def find_backend(env = 'LINGO_BACKEND')
69
+ env && get_backend(ENV[env]) || BACKENDS.find { |name|
70
+ backend = get_backend(name, nil, true) and return backend }
71
+ end
72
+
73
+ def get_backend(name, file = nil, relax = false)
74
+ return unless name
75
+
76
+ Object.const_get(name)
77
+ const_get("#{name}Store")
78
+ rescue TypeError, NameError => err
79
+ raise BackendNotAvailableError.new(name, file, err) unless relax
80
+ end
81
+
61
82
  def open(*args, &block)
62
83
  new(*args).open(&block)
63
84
  end
@@ -75,23 +96,12 @@ class Lingo
75
96
  FileUtils.mkdir_p(File.dirname(@stofile))
76
97
  rescue SourceFileNotFoundError => err
77
98
  @stofile = skip_ext = err.id
78
-
79
- unless err.name
80
- if name = BACKEND_BY_EXT[File.extname(@stofile)]
81
- backend = get_backend(name, @stofile)
82
- else
83
- raise BackendNotFoundError.new(@stofile)
84
- end
85
- end
99
+ backend = self.class.backend_by_ext(@stofile) unless err.name
86
100
  rescue NoWritableStoreError
87
101
  backend = HashStore
88
102
  end
89
103
 
90
- unless backend ||= get_backend(ENV['LINGO_BACKEND'])
91
- BACKENDS.find { |name| backend = get_backend(name, nil, true) }
92
- end
93
-
94
- extend(@backend = backend || HashStore)
104
+ extend(@backend = backend || self.class.find_backend || HashStore)
95
105
 
96
106
  @stofile << store_ext unless skip_ext || !respond_to?(:store_ext)
97
107
 
@@ -151,15 +161,6 @@ class Lingo
151
161
 
152
162
  private
153
163
 
154
- def get_backend(name, file = nil, relax = false)
155
- return unless name
156
-
157
- Object.const_get(name)
158
- self.class.const_get("#{name}Store")
159
- rescue TypeError, NameError => err
160
- raise BackendNotAvailableError.new(name, file, err) unless relax
161
- end
162
-
163
164
  def config_hash
164
165
  hashes = [config]
165
166
 
@@ -228,11 +229,11 @@ class Lingo
228
229
  end
229
230
 
230
231
  def _encode!(str)
231
- str.force_encoding(ENC)
232
+ str.force_encoding(ENCODING)
232
233
  end
233
234
 
234
235
  def convert(verbose = lingo.config.stderr.tty?)
235
- src = Source.get(config.fetch('txt-format', 'key_value'), @id, lingo)
236
+ src = Source.from_config(config, @id)
236
237
 
237
238
  sep, hyphenate, key_map, val_map = prepare_lex
238
239
 
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2015 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -35,23 +35,27 @@ class Lingo
35
35
 
36
36
  extend self
37
37
 
38
+ KEYLEN = 16
39
+
40
+ CIPHER = 'AES-128-CBC'.freeze
41
+
38
42
  def digest(key)
39
43
  Digest::SHA1.hexdigest(key)
40
44
  end
41
45
 
42
46
  def encode(key, val)
43
- [digest(key), crypt(:encrypt, key, val)]
47
+ [digest = digest(key), crypt(:encrypt, key, val, digest)]
44
48
  end
45
49
 
46
50
  def decode(key, val)
47
- crypt(:decrypt, key, val).force_encoding(ENC)
51
+ crypt(:decrypt, key, val, digest(key)).force_encoding(ENCODING)
48
52
  end
49
53
 
50
54
  private
51
55
 
52
- def crypt(method, key, val)
53
- cipher = OpenSSL::Cipher.new('aes-128-cbc').send(method)
54
- cipher.iv = cipher.key = digest(key)
56
+ def crypt(method, key, val, digest)
57
+ cipher = OpenSSL::Cipher.new(CIPHER).send(method)
58
+ cipher.iv = cipher.key = digest[0, KEYLEN]
55
59
  cipher.update(val) + cipher.final
56
60
  end
57
61
 
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -49,30 +49,57 @@ class Lingo
49
49
 
50
50
  class Source
51
51
 
52
- def self.get(name, *args)
53
- Lingo.get_const(name, self).new(*args)
52
+ LEXICAL_SEPARATOR = '#'.freeze
53
+
54
+ DEFAULT_SEPARATOR = nil
55
+
56
+ DEFAULT_DEF_WC = nil
57
+
58
+ MAX_LENGTH = 4096
59
+
60
+ class << self
61
+
62
+ def from_id(id, lingo)
63
+ from_config(lingo.database_config(id), id)
64
+ end
65
+
66
+ def from_config(config, id = nil)
67
+ format = config.fetch('txt-format', 'key_value')
68
+ Lingo.get_const(format, self).new(config['name'], config, id)
69
+ end
70
+
71
+ def lexicals(val, sep = LEXICAL_SEPARATOR, ref = KEY_REF_RE)
72
+ val.map { |str|
73
+ str =~ ref ? $1.to_i : begin
74
+ k, *w = str.split(sep)
75
+ Language::Lexical.new(k.strip, w)
76
+ end
77
+ }.uniq if val
78
+ end
79
+
54
80
  end
55
81
 
56
82
  attr_reader :pos
57
83
 
58
- def initialize(id, lingo, def_wc_default = nil)
59
- @config = lingo.database_config(id)
84
+ def initialize(name = nil, config = {}, id = nil)
85
+ @config = config
60
86
 
61
- source_file = Lingo.find(:dict, name = @config['name'], relax: true)
87
+ src_file = Lingo.find(:dict, name, relax: true) if name
62
88
 
63
- reject_file = begin
64
- Lingo.find(:store, source_file) << '.rev'
89
+ rej_file = begin
90
+ Lingo.find(:store, src_file) << '.rev'
65
91
  rescue NoWritableStoreError, SourceFileNotFoundError
66
- end
92
+ end if id && src_file
67
93
 
68
- @src = Pathname.new(source_file)
69
- @rej = Pathname.new(reject_file) if reject_file
94
+ @src = Pathname.new(src_file) if src_file
95
+ @rej = Pathname.new(rej_file) if rej_file
70
96
 
71
- raise SourceFileNotFoundError.new(name, id) unless @src.exist?
97
+ raise id ? SourceFileNotFoundError.new(name, id) :
98
+ FileNotFoundError.new(name) if name && !@src.exist?
72
99
 
73
- @def = @config.fetch('def-wc', def_wc_default)
100
+ @sep = config.fetch('separator', self.class::DEFAULT_SEPARATOR)
101
+ @def = config.fetch('def-wc', self.class::DEFAULT_DEF_WC)
74
102
  @def = @def.downcase if @def
75
- @sep = @config['separator']
76
103
 
77
104
  @wrd = "(?:#{Language::Char::ANY})+"
78
105
  @pat = /^#{@wrd}$/
@@ -85,33 +112,47 @@ class Lingo
85
112
  end
86
113
 
87
114
  def each
88
- reject_file = @rej.open('w', encoding: ENC) if @rej
115
+ return enum_for(__method__) unless block_given?
116
+ each_line { |line, key, val| yield parse_line(line, key, val) }
117
+ end
118
+
119
+ def each_line
120
+ return enum_for(__method__) unless block_given?
121
+
122
+ rej_file = @rej.open('w', encoding: ENCODING) if @rej
89
123
 
90
- @src.each_line($/, encoding: ENC) { |line|
124
+ @src.each_line($/, encoding: ENCODING) { |line|
91
125
  @pos += length = line.bytesize
92
126
 
93
127
  line.strip!
94
128
  next if line.empty? || line.start_with?('#')
95
129
 
96
- line.chomp!
97
- line.replace(Unicode.downcase(line))
98
-
99
- if length < 4096 && line =~ @pat
100
- yield convert_line(line, $1, $2)
130
+ if length < MAX_LENGTH && line.replace(Unicode.downcase(line)) =~ @pat
131
+ yield line, $1, $2
101
132
  else
102
133
  @rej_cnt += 1
103
- reject_file.puts(line) if reject_file
134
+ rej_file.puts(line) if rej_file
104
135
  end
105
136
  }
106
137
 
107
138
  self
108
139
  ensure
109
- if reject_file
110
- reject_file.close
111
- @rej.delete if @rej.size == 0
140
+ if rej_file
141
+ rej_file.close
142
+ @rej.delete if @rej.size.zero?
112
143
  end
113
144
  end
114
145
 
146
+ def each_lexical
147
+ return enum_for(__method__) unless block_given?
148
+ each { |key, val| yield key, self.class.lexicals(val) }
149
+ end
150
+
151
+ def each_dump(*args)
152
+ return enum_for(__method__, *args) unless block_given?
153
+ each_lexical { |key, val| yield dump_line(key, val, *args) }
154
+ end
155
+
115
156
  def set(db, key, val)
116
157
  db[key] = val
117
158
  end
@@ -120,6 +161,12 @@ class Lingo
120
161
  [@rej_cnt, @rej]
121
162
  end
122
163
 
164
+ private
165
+
166
+ def lexical(*args)
167
+ args.join(LEXICAL_SEPARATOR)
168
+ end
169
+
123
170
  end
124
171
 
125
172
  end
@@ -6,7 +6,7 @@
6
6
  # Lingo -- A full-featured automatic indexing system #
7
7
  # #
8
8
  # Copyright (C) 2005-2007 John Vorhauer #
9
- # Copyright (C) 2007-2014 John Vorhauer, Jens Wille #
9
+ # Copyright (C) 2007-2016 John Vorhauer, Jens Wille #
10
10
  # #
11
11
  # Lingo is free software; you can redistribute it and/or modify it under the #
12
12
  # terms of the GNU Affero General Public License as published by the Free #
@@ -39,17 +39,21 @@ class Lingo
39
39
 
40
40
  class KeyValue < self
41
41
 
42
- DEFAULT_SEPARATOR = '*'
42
+ DEFAULT_SEPARATOR = '*'.freeze
43
43
 
44
- def initialize(id, lingo)
45
- super(id, lingo, Language::LA_UNKNOWN)
46
- @pat = /^(#{@wrd})#{Regexp.escape(@sep ||= DEFAULT_SEPARATOR)}(#{@wrd})$/
44
+ DEFAULT_DEF_WC = Language::LA_UNKNOWN
45
+
46
+ def initialize(*)
47
+ super
48
+ @pat = /^(#{@wrd})#{Regexp.escape(@sep)}(#{@wrd})$/
47
49
  end
48
50
 
49
- private
51
+ def parse_line(line, key, val)
52
+ [key.strip, [lexical(val.strip, @def)]]
53
+ end
50
54
 
51
- def convert_line(line, key, val)
52
- [key.strip, %W[#{val.strip}##{@def}]]
55
+ def dump_line(key, val, sep = @sep, *)
56
+ val.map(&:form).unshift(key).join(sep)
53
57
  end
54
58
 
55
59
  end