lingo 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rspec +1 -0
- data/COPYING +663 -0
- data/ChangeLog +754 -0
- data/README +322 -0
- data/Rakefile +100 -0
- data/TODO +28 -0
- data/bin/lingo +5 -0
- data/bin/lingoctl +6 -0
- data/de.lang +121 -0
- data/de/lingo-abk.txt +74 -0
- data/de/lingo-dic.txt +56822 -0
- data/de/lingo-mul.txt +3209 -0
- data/de/lingo-syn.txt +14841 -0
- data/de/test_dic.txt +24 -0
- data/de/test_mul.txt +17 -0
- data/de/test_mul2.txt +2 -0
- data/de/test_singleword.txt +2 -0
- data/de/test_syn.txt +4 -0
- data/de/test_syn2.txt +1 -0
- data/de/user-dic.txt +10 -0
- data/en.lang +113 -0
- data/en/lingo-dic.txt +55434 -0
- data/en/lingo-mul.txt +456 -0
- data/en/user-dic.txt +5 -0
- data/info/Objekte.png +0 -0
- data/info/Typen.png +0 -0
- data/info/database.png +0 -0
- data/info/db_small.png +0 -0
- data/info/download.png +0 -0
- data/info/gpl-hdr.txt +27 -0
- data/info/kerze.png +0 -0
- data/info/language.png +0 -0
- data/info/lingo.png +0 -0
- data/info/logo.png +0 -0
- data/info/meeting.png +0 -0
- data/info/types.png +0 -0
- data/lib/lingo.rb +321 -0
- data/lib/lingo/attendee/abbreviator.rb +119 -0
- data/lib/lingo/attendee/debugger.rb +111 -0
- data/lib/lingo/attendee/decomposer.rb +101 -0
- data/lib/lingo/attendee/dehyphenizer.rb +167 -0
- data/lib/lingo/attendee/multiworder.rb +301 -0
- data/lib/lingo/attendee/noneword_filter.rb +103 -0
- data/lib/lingo/attendee/objectfilter.rb +86 -0
- data/lib/lingo/attendee/sequencer.rb +190 -0
- data/lib/lingo/attendee/synonymer.rb +105 -0
- data/lib/lingo/attendee/textreader.rb +237 -0
- data/lib/lingo/attendee/textwriter.rb +196 -0
- data/lib/lingo/attendee/tokenizer.rb +218 -0
- data/lib/lingo/attendee/variator.rb +185 -0
- data/lib/lingo/attendee/vector_filter.rb +158 -0
- data/lib/lingo/attendee/wordsearcher.rb +96 -0
- data/lib/lingo/attendees.rb +289 -0
- data/lib/lingo/cli.rb +62 -0
- data/lib/lingo/config.rb +104 -0
- data/lib/lingo/const.rb +131 -0
- data/lib/lingo/ctl.rb +173 -0
- data/lib/lingo/database.rb +587 -0
- data/lib/lingo/language.rb +530 -0
- data/lib/lingo/modules.rb +98 -0
- data/lib/lingo/types.rb +285 -0
- data/lib/lingo/utilities.rb +40 -0
- data/lib/lingo/version.rb +27 -0
- data/lingo-all.cfg +85 -0
- data/lingo-call.cfg +15 -0
- data/lingo.cfg +78 -0
- data/lingo.rb +3 -0
- data/lir.cfg +72 -0
- data/porter/stem.cfg +311 -0
- data/porter/stem.rb +150 -0
- data/spec/spec_helper.rb +0 -0
- data/test.cfg +79 -0
- data/test/attendee/ts_abbreviator.rb +35 -0
- data/test/attendee/ts_decomposer.rb +31 -0
- data/test/attendee/ts_multiworder.rb +390 -0
- data/test/attendee/ts_noneword_filter.rb +19 -0
- data/test/attendee/ts_objectfilter.rb +19 -0
- data/test/attendee/ts_sequencer.rb +43 -0
- data/test/attendee/ts_synonymer.rb +33 -0
- data/test/attendee/ts_textreader.rb +58 -0
- data/test/attendee/ts_textwriter.rb +98 -0
- data/test/attendee/ts_tokenizer.rb +32 -0
- data/test/attendee/ts_variator.rb +24 -0
- data/test/attendee/ts_vector_filter.rb +62 -0
- data/test/attendee/ts_wordsearcher.rb +119 -0
- data/test/lir.csv +3 -0
- data/test/lir.txt +12 -0
- data/test/lir2.txt +12 -0
- data/test/mul.txt +1 -0
- data/test/ref/artikel.mul +1 -0
- data/test/ref/artikel.non +159 -0
- data/test/ref/artikel.seq +270 -0
- data/test/ref/artikel.syn +16 -0
- data/test/ref/artikel.vec +928 -0
- data/test/ref/artikel.ven +928 -0
- data/test/ref/artikel.ver +928 -0
- data/test/ref/lir.csv +328 -0
- data/test/ref/lir.mul +1 -0
- data/test/ref/lir.non +274 -0
- data/test/ref/lir.seq +249 -0
- data/test/ref/lir.syn +94 -0
- data/test/test_helper.rb +113 -0
- data/test/ts_database.rb +269 -0
- data/test/ts_language.rb +396 -0
- data/txt/artikel-en.txt +157 -0
- data/txt/artikel.txt +170 -0
- data/txt/lir.txt +1317 -0
- metadata +211 -0
data/README
ADDED
@@ -0,0 +1,322 @@
|
|
1
|
+
= Lingo - A full-featured automatic indexing system
|
2
|
+
|
3
|
+
<b></b>
|
4
|
+
* {Version}[rdoc-label:label-VERSION]
|
5
|
+
* {Description}[rdoc-label:label-DESCRIPTION]
|
6
|
+
* {Introduction}[rdoc-label:label-Introduction]
|
7
|
+
* {Attendees}[rdoc-label:label-Attendees]
|
8
|
+
* {Filters}[rdoc-label:label-Filters]
|
9
|
+
* {Markup}[rdoc-label:label-Markup]
|
10
|
+
* {Inline annotation}[rdoc-label:label-Inline+annotation]
|
11
|
+
* {Plugins}[rdoc-label:label-Plugins]
|
12
|
+
* {Example}[rdoc-label:label-EXAMPLE]
|
13
|
+
* {Installation and Usage}[rdoc-label:label-INSTALLATION+AND+USAGE]
|
14
|
+
* {Dictionary and configuration file lookup}[rdoc-label:label-Dictionary+and+configuration+file+lookup]
|
15
|
+
* {Legacy version}[rdoc-label:label-Legacy+version]
|
16
|
+
* {File formats}[rdoc-label:label-FILE+FORMATS]
|
17
|
+
* {Configuration}[rdoc-label:label-Configuration]
|
18
|
+
* {Language definition}[rdoc-label:label-Language+definition]
|
19
|
+
* {Dictionaries}[rdoc-label:label-Dictionaries]
|
20
|
+
* {Issues and Contributions}[rdoc-label:label-ISSUES+AND+CONTRIBUTIONS]
|
21
|
+
* {Links}[rdoc-label:label-LINKS]
|
22
|
+
* {Credits}[rdoc-label:label-CREDITS]
|
23
|
+
* {License and Copyright}[rdoc-label:label-LICENSE+AND+COPYRIGHT]
|
24
|
+
|
25
|
+
== VERSION
|
26
|
+
|
27
|
+
This documentation refers to Lingo version 1.8.0
|
28
|
+
|
29
|
+
|
30
|
+
== DESCRIPTION
|
31
|
+
|
32
|
+
Lingo is an open source indexing system for research and teaching. The main
|
33
|
+
functions of Lingo are:
|
34
|
+
|
35
|
+
* identification of (i.e. reduction to) basic word form by means of dictionaries
|
36
|
+
and suffix lists
|
37
|
+
* algorithmic decomposition
|
38
|
+
* dictionary-based synonymisation and identification of phrases
|
39
|
+
* generic identification of phrases/word sequences based on patterns of word
|
40
|
+
classes
|
41
|
+
|
42
|
+
=== Introduction
|
43
|
+
|
44
|
+
If you want to perform linguistic analysis on some text, Lingo is there to
|
45
|
+
support your endeavour with all its flexibility and extendability. Lingo
|
46
|
+
enables you to assemble a network of practically unlimited functionality
|
47
|
+
from modules with limited functions. This network is built by configuration
|
48
|
+
files. Here's a minimal example:
|
49
|
+
|
50
|
+
meeting:
|
51
|
+
attendees:
|
52
|
+
- textreader: { files: 'README' }
|
53
|
+
- debugger: { eval: 'true', ceval: 'cmd!="EOL"', prompt: '<debug>: ' }
|
54
|
+
|
55
|
+
Lingo is told to invite two attendees. And Lingo wants them to talk to each
|
56
|
+
other, hence the name Lingo (= the technical language).
|
57
|
+
|
58
|
+
The first attendee is the +textreader+ (Lingo::Attendee::Textreader). It can
|
59
|
+
read files (as well as standard input) and communicate its content to other
|
60
|
+
attendees. For this purpose the +textreader+ is given an output channel.
|
61
|
+
Everything that the +textreader+ has to say is steered through this channel.
|
62
|
+
It will do nothing further until Lingo tells the first attendee to speak.
|
63
|
+
Then the +textreader+ will open the file +README+ (<tt>files</tt> parameter)
|
64
|
+
and babble the content to the world via its output channel.
|
65
|
+
|
66
|
+
The second attendee +debugger+ (Lingo::Attendee::Debugger) does nothing else
|
67
|
+
than to put everything on the console (standard error, actually) that comes
|
68
|
+
into its input channel. If you write the Lingo configuration which is shown
|
69
|
+
above as an example into the file readme.cfg and then run <tt>lingo -c readme
|
70
|
+
-l en</tt>, the result will look something like this:
|
71
|
+
|
72
|
+
<debug>: *FILE('README')
|
73
|
+
<debug>: "= Lingo - [...]"
|
74
|
+
...
|
75
|
+
<debug>: "If you want to perform linguistic analysis on some text, [...]"
|
76
|
+
<debug>: "support your endeavour with all its flexibility and [...]"
|
77
|
+
...
|
78
|
+
<debug>: *EOF('README')
|
79
|
+
|
80
|
+
What we see are lines with an asterisk (*) and lines without. That's because
|
81
|
+
Lingo distinguishes between commands and data. The +textreader+ not only
|
82
|
+
read the content of the file, but also communicated through the commands when
|
83
|
+
a file begins and when it ends. This can (and will) be an important piece of
|
84
|
+
information for other attendees that will be added later.
|
85
|
+
|
86
|
+
To try out Lingo's functionality without installing it first, have a look at
|
87
|
+
{Lingo Web}[http://linux2.fbi.fh-koeln.de/lingoweb]. There you can enter some
|
88
|
+
text and see the debug output Lingo generated, including tokenization, word
|
89
|
+
identification, decomposition, etc.
|
90
|
+
|
91
|
+
=== Attendees
|
92
|
+
|
93
|
+
Available attendees that can be used for solving a specific problem (for more
|
94
|
+
information see each attendee's documentation):
|
95
|
+
|
96
|
+
<tt>textreader</tt>:: Reads files and puts their content into the channels line by
|
97
|
+
line. (Lingo::Attendee::Textreader)
|
98
|
+
<tt>tokenizer</tt>:: Dissects lines into defined character strings, i.e. tokens.
|
99
|
+
(Lingo::Attendee::Tokenizer)
|
100
|
+
<tt>abbreviator</tt>:: Identifies abbreviations and produces the long form if listed
|
101
|
+
in a dictionary. (Lingo::Attendee::Abbreviator)
|
102
|
+
<tt>wordsearcher</tt>:: Identifies tokens and turns them into words for further
|
103
|
+
processing. To do this right it looks into the dictionary.
|
104
|
+
(Lingo::Attendee::Wordsearcher)
|
105
|
+
<tt>decomposer</tt>:: Tests any character strings not identified by the +wordsearcher+
|
106
|
+
for being compounds. (Lingo::Attendee::Decomposer)
|
107
|
+
<tt>synonymer</tt>:: Extends words with synonyms. (Lingo::Attendee::Synonymer)
|
108
|
+
<tt>noneword_filter</tt>:: Filters out everything and lets through only those tokens that
|
109
|
+
are unknown. (Lingo::Attendee::Noneword_filter)
|
110
|
+
<tt>vector_filter</tt>:: Filters out everything and lets through only those tokens that are
|
111
|
+
considered useful for indexing. (Lingo::Attendee::Vector_filter)
|
112
|
+
<tt>objectfilter</tt>:: Similar to the +vector_filter+. (Lingo::Attendee::Objectfilter)
|
113
|
+
<tt>textwriter</tt>:: Writes anything that it receives into a file (or to standard
|
114
|
+
output). (Lingo::Attendee::Textwriter)
|
115
|
+
<tt>formatter</tt>:: Similar to the +textwriter+, but allows for custom output formats.
|
116
|
+
(Lingo::Attendee::Formatter)
|
117
|
+
<tt>debugger</tt>:: Shows everything for debugging. (Lingo::Attendee::Debugger)
|
118
|
+
<tt>variator</tt>:: Tries to correct spelling errors and the like.
|
119
|
+
(Lingo::Attendee::Variator)
|
120
|
+
<tt>dehyphenizer</tt>:: Tries to undo hyphenation. (Lingo::Attendee::Dehyphenizer)
|
121
|
+
<tt>multiworder</tt>:: Identifies phrases (word sequences) based on a multiword
|
122
|
+
dictionary. (Lingo::Attendee::Multiworder)
|
123
|
+
<tt>sequencer</tt>:: Identifies phrases (word sequences) based on patterns of word
|
124
|
+
classes. (Lingo::Attendee::Sequencer)
|
125
|
+
|
126
|
+
Furthermore, it may be useful to have a look at the configuration files
|
127
|
+
<tt>lingo.cfg</tt> and <tt>en.lang</tt>.
|
128
|
+
|
129
|
+
=== Filters
|
130
|
+
|
131
|
+
Lingo is able to read HTML, XML, and PDF.
|
132
|
+
|
133
|
+
TODO: Examples.
|
134
|
+
|
135
|
+
=== Markup
|
136
|
+
|
137
|
+
Lingo is able to parse HTML/XML and MediaWiki markup.
|
138
|
+
|
139
|
+
TODO: Examples.
|
140
|
+
|
141
|
+
=== Inline annotation
|
142
|
+
|
143
|
+
Lingo is able to annotate input text inline, instead of printing results to
|
144
|
+
external files.
|
145
|
+
|
146
|
+
TODO: Examples.
|
147
|
+
|
148
|
+
=== Plugins
|
149
|
+
|
150
|
+
Lingo has a plugin system that allows you to implement additional features
|
151
|
+
(e.g. add new attendees) or modify existing ones. Just create a file named
|
152
|
+
+lingo_plugin.rb+ in your Gem's +lib+ directory or any directory that's in
|
153
|
+
<tt>$LOAD_PATH</tt>. You can also define an environment variable +LINGO_PLUGIN_PATH+
|
154
|
+
with additional directories to load plugins from (<tt>*.rb</tt>).
|
155
|
+
|
156
|
+
A dedicated API to support writing and integrating plugins will be added in
|
157
|
+
the future.
|
158
|
+
|
159
|
+
|
160
|
+
== EXAMPLE
|
161
|
+
|
162
|
+
TODO: Full-fledged example to show off Lingo's features and provide a basis
|
163
|
+
for further discussion.
|
164
|
+
|
165
|
+
|
166
|
+
== INSTALLATION AND USAGE
|
167
|
+
|
168
|
+
Since version 1.8.0, Lingo is available as a RubyGem. So a simple <tt>gem
|
169
|
+
install lingo</tt> will install Lingo and its dependencies (you might want
|
170
|
+
to run that command with administrator privileges, depending on your
|
171
|
+
environment). Then you can call the +lingo+ executable to process your
|
172
|
+
data. See <tt>lingo --help</tt> for starters.
|
173
|
+
|
174
|
+
Please note that Lingo requires Ruby version 1.9 to run
|
175
|
+
(1.9.3[http://ruby-lang.org/en/downloads/] is the currently recommended
|
176
|
+
version). If you want to use Lingo on Ruby 1.8, please refer to the legacy
|
177
|
+
version (see below).
|
178
|
+
|
179
|
+
Prior to version 1.8.0, Lingo expected to be run from its installation
|
180
|
+
directory. This is no longer necessary. But if you prefer that use case,
|
181
|
+
you can either download and extract an
|
182
|
+
{archive file}[http://github.com/lex-lingo/lingo/tags] or unpack the
|
183
|
+
Gem archive (<tt>gem unpack lingo</tt>); or you can install the legacy
|
184
|
+
version of Lingo (see below).
|
185
|
+
|
186
|
+
=== Dictionary and configuration file lookup
|
187
|
+
|
188
|
+
Lingo will search different locations to find dictionaries and configuration
|
189
|
+
files. By default, these are the current directory, your personal Lingo
|
190
|
+
directory (<tt>~/.lingo</tt>) and the installation directory (in that order).
|
191
|
+
You can control this lookup path by either moving files up the chain (using
|
192
|
+
the +lingoctl+ executable) or by setting various environment variables.
|
193
|
+
|
194
|
+
With +lingoctl+ you can copy dictionaries and configuration files from your
|
195
|
+
personal Lingo directory or the installation directory to the current
|
196
|
+
directory so you can modify them and they will take precedence over the
|
197
|
+
original ones. See <tt>lingoctl --help</tt> for usage information.
|
198
|
+
|
199
|
+
In order to change the search path itself, you can define the
|
200
|
+
+LINGO_PATH+ environment variable as a whole or its individual parts
|
201
|
+
+LINGO_CURR+ (the local Lingo directory), +LINGO_HOME+ (your personal
|
202
|
+
Lingo directory), and +LINGO_BASE+ (the system-wide Lingo directory).
|
203
|
+
|
204
|
+
Inside any of these directories, dictionaries and configuration files are
|
205
|
+
typically organized in the following directory structure:
|
206
|
+
|
207
|
+
<tt>config</tt>:: Configuration files (<tt>*.cfg</tt>).
|
208
|
+
<tt>dict</tt>:: Dictionary source files (<tt>*.txt</tt>); in
|
209
|
+
language-specific subdirectories (+de+, +en+, ...).
|
210
|
+
<tt>lang</tt>:: Language definition files (<tt>*.lang</tt>).
|
211
|
+
<tt>store</tt>:: Compiled dictionaries, generated from source files.
|
212
|
+
|
213
|
+
But for compatibility reasons these naming conventions are not enforced.
|
214
|
+
|
215
|
+
=== Legacy version
|
216
|
+
|
217
|
+
As Lingo 1.8 introduced some major disruptions and no longer runs on Ruby 1.8,
|
218
|
+
there is a maintenance branch for Lingo 1.7.x that will remain compatible with
|
219
|
+
both Ruby 1.8 and the previous line of Lingo prior to 1.8. This branch will
|
220
|
+
receive occasional bug fixes and minor feature updates. However, the bulk of
|
221
|
+
the development efforts will be directed towards Lingo 1.8+.
|
222
|
+
|
223
|
+
To install the legacy version, download and extract the ZIP archive from
|
224
|
+
RubyForge[http://rubyforge.org/frs/?group_id=5663]. No additional dependencies
|
225
|
+
are required. This version of Lingo works with both Ruby 1.8 (1.8.5 or greater)
|
226
|
+
and 1.9.
|
227
|
+
|
228
|
+
The executable is named +lingo.rb+. It's located at the root of the installation
|
229
|
+
directory and may only be run from there. See <tt>ruby lingo.rb -h</tt> for
|
230
|
+
usage instructions.
|
231
|
+
|
232
|
+
Configuration and language definition files are also located at the root of the
|
233
|
+
installation directory (<tt>*.cfg</tt> and <tt>*.lang</tt>, respectively).
|
234
|
+
Dictionary source files are found in language-specific subdirectories (+de+,
|
235
|
+
+en+, ...) and are named <tt>*.txt</tt>. The compiled dictionaries are found
|
236
|
+
beneath these subdirectories in a directory named <tt>store</tt>.
|
237
|
+
|
238
|
+
|
239
|
+
== FILE FORMATS
|
240
|
+
|
241
|
+
Lingo uses three different types of files to determine its behaviour.
|
242
|
+
Configuration files control the details of the indexing process. Language
|
243
|
+
definitions specify grammar rules and dictionaries available for indexing.
|
244
|
+
Dictionaries, finally, hold the vocabulary used in indexing the input text
|
245
|
+
and producing the results.
|
246
|
+
|
247
|
+
=== Configuration
|
248
|
+
|
249
|
+
TODO...
|
250
|
+
|
251
|
+
=== Language definition
|
252
|
+
|
253
|
+
TODO...
|
254
|
+
|
255
|
+
=== Dictionaries
|
256
|
+
|
257
|
+
TODO...
|
258
|
+
|
259
|
+
|
260
|
+
== ISSUES AND CONTRIBUTIONS
|
261
|
+
|
262
|
+
If you find bugs or want to suggest new features, please write to the
|
263
|
+
{mailing list}[mailto:lingo-users@rubyforge.org] or report them on
|
264
|
+
GitHub[http://github.com/lex-lingo/lingo/issues]. Include your Ruby
|
265
|
+
version (<tt>ruby --version</tt>) and the version of Lingo you are using
|
266
|
+
(typically <tt>lingo --version</tt>, provided it's new enough to support
|
267
|
+
that flag).
|
268
|
+
|
269
|
+
If you want to contribute to Lingo, please fork the project on
|
270
|
+
GitHub[http://github.com/lex-lingo/lingo] and submit a
|
271
|
+
{pull request}[http://github.com/lex-lingo/lingo/pulls] (bonus points for
|
272
|
+
topic branches) or clone the repository[http://github.com/lex-lingo/lingo]
|
273
|
+
locally and send your formatted patch to the
|
274
|
+
{developer list}[mailto:lingo-core@rubyforge.org].
|
275
|
+
|
276
|
+
|
277
|
+
== LINKS
|
278
|
+
|
279
|
+
<b></b>
|
280
|
+
Website:: http://lex-lingo.de
|
281
|
+
Demo:: http://linux2.fbi.fh-koeln.de/lingoweb
|
282
|
+
Documentation:: http://lex-lingo.github.com/lingo
|
283
|
+
Source code:: http://github.com/lex-lingo/lingo
|
284
|
+
RubyGem:: http://rubygems.org/gems/lingo
|
285
|
+
RubyForge project:: http://rubyforge.org/projects/lingo
|
286
|
+
Mailing list:: http://rubyforge.org/mailman/listinfo/lingo-users
|
287
|
+
Bug tracker:: http://github.com/lex-lingo/lingo/issues
|
288
|
+
|
289
|
+
|
290
|
+
== CREDITS
|
291
|
+
|
292
|
+
Lingo is based on a collective development by Klaus Lepsky and John Vorhauer.
|
293
|
+
|
294
|
+
=== Authors
|
295
|
+
|
296
|
+
* John Vorhauer <mailto:lingo@vorhauer.de>
|
297
|
+
* Jens Wille <mailto:jens.wille@uni-koeln.de>
|
298
|
+
|
299
|
+
=== Contributors
|
300
|
+
|
301
|
+
* Klaus Lepsky <mailto:klaus@lepsky.de>
|
302
|
+
* Jan-Helge Jacobs <mailto:plancton@web.de>
|
303
|
+
* Thomas Müller <mailto:thomas.mueller@fh-koeln.de>
|
304
|
+
|
305
|
+
|
306
|
+
== LICENSE AND COPYRIGHT
|
307
|
+
|
308
|
+
Copyright (C) 2005-2007 John Vorhauer
|
309
|
+
Copyright (C) 2007-2012 John Vorhauer, Jens Wille
|
310
|
+
|
311
|
+
Lingo is free software: you can redistribute it and/or modify it under the
|
312
|
+
terms of the GNU Affero General Public License as published by the Free
|
313
|
+
Software Foundation, either version 3 of the License, or (at your option)
|
314
|
+
any later version.
|
315
|
+
|
316
|
+
Lingo is distributed in the hope that it will be useful, but WITHOUT ANY
|
317
|
+
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
318
|
+
FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
319
|
+
details.
|
320
|
+
|
321
|
+
You should have received a copy of the GNU Affero General Public License along
|
322
|
+
with Lingo. If not, see <http://www.gnu.org/licenses/>.
|
data/Rakefile
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
__DIR__ = File.expand_path('..', __FILE__)
|
4
|
+
|
5
|
+
require 'rake/clean'
|
6
|
+
require File.join(__DIR__, %w[lib lingo version])
|
7
|
+
|
8
|
+
PACKAGE_NAME = 'lingo'
|
9
|
+
PACKAGE_PATH = File.join(__DIR__, 'pkg', "#{PACKAGE_NAME}-#{Lingo::VERSION}")
|
10
|
+
|
11
|
+
if RUBY_PLATFORM =~ /msdos|mswin|djgpp|mingw|windows/i
|
12
|
+
ZIP_COMMANDS = ['zip', '7z a'] # for hen's gem task
|
13
|
+
end
|
14
|
+
|
15
|
+
task default: :spec
|
16
|
+
task package: [:checkdoc, 'test:all', :clean]
|
17
|
+
|
18
|
+
begin
|
19
|
+
require 'hen'
|
20
|
+
|
21
|
+
Hen.lay! {{
|
22
|
+
gem: {
|
23
|
+
name: PACKAGE_NAME,
|
24
|
+
version: Lingo::VERSION,
|
25
|
+
summary: 'The full-featured automatic indexing system',
|
26
|
+
authors: ['John Vorhauer', 'Jens Wille'],
|
27
|
+
email: ['lingo@vorhauer.de', 'jens.wille@uni-koeln.de'],
|
28
|
+
homepage: 'http://lex-lingo.de',
|
29
|
+
extra_files: FileList[
|
30
|
+
'lingo.rb', 'lingo{,-all,-call}.cfg', 'lingo.opt', 'doc/**/*',
|
31
|
+
'{de,en}.lang', '{de,en}/{lingo-*,user-dic}.txt', 'txt/artikel{,-en}.txt',
|
32
|
+
'info/gpl-hdr.txt', 'info/*.png', 'lir.cfg', 'txt/lir.txt', 'porter/*',
|
33
|
+
'test.cfg', '{de,en}/test_*.txt'
|
34
|
+
].to_a,
|
35
|
+
required_ruby_version: '>= 1.9',
|
36
|
+
dependencies: [['ruby-nuggets', '>= 0.8.2'], 'unicode'],
|
37
|
+
development_dependencies: [['diff-lcs', '>= 1.1.3'], 'open4']
|
38
|
+
}
|
39
|
+
}}
|
40
|
+
rescue LoadError => err
|
41
|
+
warn "Please install the `hen' gem first. (#{err})"
|
42
|
+
end
|
43
|
+
|
44
|
+
CLEAN.include(
|
45
|
+
'txt/*.{log,mul,non,seq,syn,ve?,csv}',
|
46
|
+
'test/{test.*,text.non}',
|
47
|
+
'store/*/*.rev'
|
48
|
+
)
|
49
|
+
|
50
|
+
CLOBBER.include(
|
51
|
+
'store', 'doc' ,'pkg/*', PACKAGE_PATH + '.*'
|
52
|
+
)
|
53
|
+
|
54
|
+
task :checkdoc do
|
55
|
+
docfile = File.join(__DIR__, 'doc', 'index.html')
|
56
|
+
abort "Please run `rake doc' first." unless File.exists?(docfile)
|
57
|
+
end
|
58
|
+
|
59
|
+
desc 'Run ALL tests'
|
60
|
+
task 'test:all' => [:test, 'test:txt', 'test:lir']
|
61
|
+
|
62
|
+
Rake::TestTask.new(:test) do |t|
|
63
|
+
t.test_files = FileList.new('test/ts_*.rb', 'test/attendee/ts_*.rb')
|
64
|
+
end
|
65
|
+
|
66
|
+
desc 'Test against reference file (TXT)'
|
67
|
+
task 'test:txt' do
|
68
|
+
test_ref('artikel', 'test')
|
69
|
+
end
|
70
|
+
|
71
|
+
desc 'Test against reference file (LIR)'
|
72
|
+
task 'test:lir' do
|
73
|
+
test_ref('lir')
|
74
|
+
end
|
75
|
+
|
76
|
+
desc 'Run all tests on packaged distribution'
|
77
|
+
task 'test:remote' => [:package] do
|
78
|
+
chdir(PACKAGE_PATH) { system('rake test:all') } || abort
|
79
|
+
end
|
80
|
+
|
81
|
+
def test_ref(name, cfg = name)
|
82
|
+
require 'nuggets/util/ruby'
|
83
|
+
|
84
|
+
require 'diff/lcs'
|
85
|
+
require 'diff/lcs/ldiff'
|
86
|
+
|
87
|
+
cmd = %W[lingo.rb -c #{cfg} txt/#{name}.txt]
|
88
|
+
continue, msg = 0, ["Command failed: #{cmd.join(' ')}"]
|
89
|
+
|
90
|
+
Process.ruby(*cmd) { |_, _, o, e|
|
91
|
+
IO.interact({}, { o => msg, e => msg })
|
92
|
+
}.success? or abort msg.join("\n\n")
|
93
|
+
|
94
|
+
Dir["test/ref/#{name}.*"].each { |ref|
|
95
|
+
puts "#{'#' * 60} #{org = ref.sub(/test\/ref/, 'txt')}"
|
96
|
+
continue += Diff::LCS::Ldiff.run(ARGV.clear << '-a' << org << ref)
|
97
|
+
}
|
98
|
+
|
99
|
+
exit continue + 1 unless continue.zero?
|
100
|
+
end
|
data/TODO
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
= ToDo list for Lingo
|
2
|
+
|
3
|
+
(most important first)
|
4
|
+
|
5
|
+
* Update and translate old documentation.
|
6
|
+
* Allow for handling of documents in various encodings, not just the one the
|
7
|
+
dictionaries are encoded in.
|
8
|
+
* Provide automatic encoding detection.
|
9
|
+
* Provide automatic language detection (as fine-grained as possible).
|
10
|
+
* Make lingo run faster!? (benchmark - profile - optimize)
|
11
|
+
* Replace SDBM by DBM (more platform-independent, no 1k limit on record size);
|
12
|
+
maybe QDBM/Tokyo Cabinet or even CDB for faster access.
|
13
|
+
* In-memory (volatile) vs. on-disk (persistent) dictionaries. It should be
|
14
|
+
possible to simply use the Lingo API without caring about dictionary storage.
|
15
|
+
* That being said, provide an easy-to-use Lingo API -- just 'require "lingo"'
|
16
|
+
and go for it!
|
17
|
+
* In addition to that, provide sensible string extensions: String#tokenize,
|
18
|
+
String#lemmatize, ...
|
19
|
+
* Provide a DSL for configuration -- in addition to, or instead of, the current
|
20
|
+
YAML format.
|
21
|
+
* Make sure the Crypter is sufficiently secure.
|
22
|
+
* Use RSpec for testing.
|
23
|
+
* Make Lingo capable of using multiple cores or even machines to boost performance
|
24
|
+
by connecting Attendees through sockets and using separate processes.
|
25
|
+
|
26
|
+
NOTE: New code *should* meet the guidelines outlined in the
|
27
|
+
RubyStyleGuide[https://github.com/bbatsov/ruby-style-guide];
|
28
|
+
existing code will be adjusted along the way.
|