langa 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING +674 -0
- data/README +69 -0
- data/bin/langa +169 -0
- data/examples/afrikaans_1953_utf8.txt +1000 -0
- data/examples/albanian_utf8.txt +1000 -0
- data/examples/amharic_utf8.txt +1000 -0
- data/examples/arabic_svd_utf8.txt +1000 -0
- data/examples/armenian_western_1853_utf8.txt +1000 -0
- data/examples/asv_utf8.txt +1000 -0
- data/examples/basque_1571_utf8.txt +1000 -0
- data/examples/breton_utf8.txt +1000 -0
- data/examples/chinese_ncv_s_utf8.txt +1000 -0
- data/examples/chinese_ncv_utf8.txt +1000 -0
- data/examples/chinese_union_s_utf8.txt +1000 -0
- data/examples/chinese_union_utf8.txt +1000 -0
- data/examples/coptic_nt_utf8.txt +1000 -0
- data/examples/croatian_utf8.txt +1000 -0
- data/examples/czech_bkr_utf8.txt +1000 -0
- data/examples/danish_utf8.txt +1000 -0
- data/examples/dutch_svv_utf8.txt +1000 -0
- data/examples/esperanto_utf8.txt +1000 -0
- data/examples/estonian_utf8.txt +1000 -0
- data/examples/finnish_pr_1992_utf8.txt +1000 -0
- data/examples/french_ostervald_1996_utf8.txt +1000 -0
- data/examples/german_schlachter_1951_utf8.txt +1000 -0
- data/examples/greek_byzantine_2000_utf8.txt +1000 -0
- data/examples/greek_modern_utf8.txt +1000 -0
- data/examples/hebrew_modern_utf8.txt +1000 -0
- data/examples/hungarian_karoli_utf8.txt +1000 -0
- data/examples/italian_riveduta_1927_utf8.txt +1000 -0
- data/examples/kabyle_nt_utf8.txt +1000 -0
- data/examples/kjv_apocrypha_utf8.txt +1000 -0
- data/examples/korean_utf8.txt +1000 -0
- data/examples/latin_vulgata_clementina_utf8.txt +1000 -0
- data/examples/latvian_nt_utf8.txt +1000 -0
- data/examples/lithuanian_utf8.txt +1000 -0
- data/examples/manx_gaelic_utf8.txt +1000 -0
- data/examples/maori_utf8.txt +1000 -0
- data/examples/myanmar_judson_1835_utf8.txt +1000 -0
- data/examples/norwegian_utf8.txt +1000 -0
- data/examples/peshitta_utf8.txt +1000 -0
- data/examples/portuguese_utf8.txt +1000 -0
- data/examples/romani_utf8.txt +1000 -0
- data/examples/romanian_cornilescu_utf8.txt +1000 -0
- data/examples/russian_makarij_utf8.txt +1000 -0
- data/examples/spanish_reina_valera_1909_utf8.txt +1000 -0
- data/examples/swedish_1917_utf8.txt +1000 -0
- data/examples/tagalog_1905_utf8.txt +1000 -0
- data/examples/thai_kjv_utf8.txt +1000 -0
- data/examples/turkish_nt_utf8.txt +1000 -0
- data/examples/turkish_utf8.txt +1000 -0
- data/examples/ukrainian_1871_utf8.txt +1000 -0
- data/examples/vietnamese_1934_utf8.txt +1000 -0
- data/examples/wolof_utf8.txt +1000 -0
- data/examples/xhosa_utf8.txt +1000 -0
- data/lib/langa.rb +35 -0
- data/lib/langa/dna.rb +209 -0
- data/lib/langa/file.rb +97 -0
- data/lib/langa/langa.dna +406 -0
- data/lib/langa/languageanalyzer.rb +134 -0
- data/lib/langa/languages.rb +147 -0
- data/lib/langa/randomtestfiles.rb +140 -0
- data/lib/langa/utilities.rb +53 -0
- data/test/tc_file.rb +47 -0
- data/test/tc_languages.rb +69 -0
- data/test/tc_utilities.rb +42 -0
- data/unicode/CaseFolding.txt +1065 -0
- data/unicode/CaseFolding.txt.webloc +8 -0
- data/unicode/Index of -Public-MAPPINGS.webloc b/data/unicode/Index of → -Public-MAPPINGS.webloc +0 -0
- data/unicode/mappings/8859-1.TXT +303 -0
- data/unicode/mappings/8859-10.TXT +303 -0
- data/unicode/mappings/8859-11.TXT +297 -0
- data/unicode/mappings/8859-13.TXT +299 -0
- data/unicode/mappings/8859-14.TXT +301 -0
- data/unicode/mappings/8859-15.TXT +303 -0
- data/unicode/mappings/8859-16.TXT +299 -0
- data/unicode/mappings/8859-2.TXT +303 -0
- data/unicode/mappings/8859-3.TXT +296 -0
- data/unicode/mappings/8859-4.TXT +303 -0
- data/unicode/mappings/8859-5.TXT +303 -0
- data/unicode/mappings/8859-6.TXT +260 -0
- data/unicode/mappings/8859-7.TXT +308 -0
- data/unicode/mappings/8859-8.TXT +270 -0
- data/unicode/mappings/8859-9.TXT +307 -0
- data/unicode/mappings/ATARIST.TXT +313 -0
- data/unicode/mappings/CP037.TXT +275 -0
- data/unicode/mappings/CP1006.TXT +302 -0
- data/unicode/mappings/CP1026.TXT +275 -0
- data/unicode/mappings/CP1250.TXT +274 -0
- data/unicode/mappings/CP1251.TXT +274 -0
- data/unicode/mappings/CP1252.TXT +274 -0
- data/unicode/mappings/CP1253.TXT +274 -0
- data/unicode/mappings/CP1254.TXT +274 -0
- data/unicode/mappings/CP1255.TXT +274 -0
- data/unicode/mappings/CP1256.TXT +274 -0
- data/unicode/mappings/CP1257.TXT +274 -0
- data/unicode/mappings/CP1258.TXT +274 -0
- data/unicode/mappings/CP424.TXT +304 -0
- data/unicode/mappings/CP437.TXT +274 -0
- data/unicode/mappings/CP500.TXT +275 -0
- data/unicode/mappings/CP737.TXT +274 -0
- data/unicode/mappings/CP775.TXT +275 -0
- data/unicode/mappings/CP850.TXT +274 -0
- data/unicode/mappings/CP852.TXT +274 -0
- data/unicode/mappings/CP855.TXT +275 -0
- data/unicode/mappings/CP856.TXT +303 -0
- data/unicode/mappings/CP857.TXT +275 -0
- data/unicode/mappings/CP860.TXT +275 -0
- data/unicode/mappings/CP861.TXT +275 -0
- data/unicode/mappings/CP862.TXT +275 -0
- data/unicode/mappings/CP863.TXT +275 -0
- data/unicode/mappings/CP864.TXT +275 -0
- data/unicode/mappings/CP865.TXT +275 -0
- data/unicode/mappings/CP866.TXT +275 -0
- data/unicode/mappings/CP869.TXT +275 -0
- data/unicode/mappings/CP874.TXT +274 -0
- data/unicode/mappings/CP875.TXT +275 -0
- data/unicode/mappings/KOI8-R.TXT +302 -0
- data/unicode/mappings/NEXTSTEP.TXT +173 -0
- data/unicode/mappings/ROMAN.TXT +275 -0
- data/unicode/mappings/US-ASCII-QUOTES.TXT +198 -0
- metadata +180 -0
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
=begin
|
|
2
|
+
|
|
3
|
+
Copyright © 2007 John Vorhauer
|
|
4
|
+
Contact me at langa@vorhauer.de near 50°55'N+6°55'E.
|
|
5
|
+
|
|
6
|
+
This file is part of Langa.
|
|
7
|
+
|
|
8
|
+
Langa is free software: you can redistribute it and/or modify
|
|
9
|
+
it under the terms of the GNU General Public License as published by
|
|
10
|
+
the Free Software Foundation, either version 3 of the License, or
|
|
11
|
+
(at your option) any later version.
|
|
12
|
+
|
|
13
|
+
Langa is distributed in the hope that it will be useful,
|
|
14
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
15
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
16
|
+
GNU General Public License for more details.
|
|
17
|
+
|
|
18
|
+
You should have received a copy of the GNU General Public License
|
|
19
|
+
along with Langa. If not, see <http://www.gnu.org/licenses/>.
|
|
20
|
+
|
|
21
|
+
For a detailed functional description of Langa see README file
|
|
22
|
+
|
|
23
|
+
=end
|
|
24
|
+
|
|
25
|
+
require 'rubygems'
|
|
26
|
+
require 'languages'
|
|
27
|
+
|
|
28
|
+
=begin rdoc
|
|
29
|
+
|
|
30
|
+
The class LanguageAnalyzer is the heart of Langa. It has two main use
|
|
31
|
+
cases:
|
|
32
|
+
|
|
33
|
+
= Recognize the language of a textfile
|
|
34
|
+
In this mode the LanguageAnalyzer identifies the language of a textfile by
|
|
35
|
+
comparing the fingerprint of the textfile against the ones documented in
|
|
36
|
+
the language configuration file 'language.dna'. Call
|
|
37
|
+
|
|
38
|
+
la = LanguageAnalyzer.new
|
|
39
|
+
la.analyze_file('german-file') -> 'deu'
|
|
40
|
+
|
|
41
|
+
If you wish additional codepage conversion for the input file call
|
|
42
|
+
|
|
43
|
+
la.analyze_file('german-file-iso-8859-1', '8859-1') -> 'deu'
|
|
44
|
+
|
|
45
|
+
= Create a new language fingerprint
|
|
46
|
+
If you have a big textfile of a previously unknown language, you can
|
|
47
|
+
calculate the fingerprint of this language and add it to the language
|
|
48
|
+
configuration file 'language.dna'. Call
|
|
49
|
+
|
|
50
|
+
la = LanguageAnalyzer.new
|
|
51
|
+
la.scan_language_dna('landir/*')
|
|
52
|
+
|
|
53
|
+
to scan all files from the landir directory. To automatically identify
|
|
54
|
+
the iso 639 language codes and the codepage that should be used for reading,
|
|
55
|
+
name the input files in a form '<iso-code>.<Language>.<codepage>.txt', i.e.
|
|
56
|
+
'landir/deu.German.utf-8.txt'.
|
|
57
|
+
|
|
58
|
+
=end
|
|
59
|
+
|
|
60
|
+
# Recognizes the language of text files by comparing their DNA fingerprint
# with the fingerprints of all configured languages, and prints fingerprint
# entries for new language samples.
class LanguageAnalyzer

  # Create a new instance of the LanguageAnalyzer. +language_file+ is handed
  # to Languages.new unchanged.
  #   la = LanguageAnalyzer.new
  def initialize(language_file='language.dna')
    @languages = Languages.new(language_file)
  end

  # Get the sorted keys of all known languages.
  #   la.keys -> ['deu', 'eng', ...]
  def keys
    @languages.keys.sort
  end

  # Get the configuration of a single language.
  #   la.config('deu') -> {'name'=>'German', 'iso1'=>'de', ...}
  def config(key)
    @languages.config(key)
  end

  # Get the source files of all known languages.
  #   la.sources -> ["corpora/ger.german.utf-8.txt", ...]
  def sources
    @languages.values_for('source').keys
  end

  # Analyze the language of a file.
  #
  # Returns the key of the language with the smallest fingerprint distance,
  # or +nil+ when no languages are configured. With +full_detail+ set, the
  # complete protocol of the analysis is returned instead: one
  # <tt>[distance, key, name]</tt> entry per language, ordered by key.
  #   la.analyze('german-file-utf8')                -> 'deu'
  #   la.analyze('german-file-iso-8859-1', '8859-1') -> 'deu'
  def analyze(filename, codepage='utf-8', full_detail=false)
    dna = DNA.new
    dna.feed(filename, codepage)
    # NOTE(review): the result was never used by the original code either;
    # the call is kept in case DNA#fingerprint prepares state for #distance.
    dna.fingerprint

    scores = @languages.keys.map do |key|
      lang = @languages.config(key)
      [dna.distance(lang['dna']), key, lang['name']]
    end

    return scores.sort_by { |score| score[1] } if full_detail

    # Array comparison orders by distance first, then key, then name.
    best = scores.min
    best && best[1]
  end

  # The documentation of this class refers to the method by this name.
  alias :analyze_file :analyze

  # Print a fingerprint entry for every file matching +pattern+, ready to be
  # copied to the language configuration file 'language.dna'. Each file is
  # read with +codepage+. The sample should have at least 100.000 letters;
  # the more, the better for the quality of the language recognition.
  #
  # The language code and name are printed as placeholders that have to be
  # filled in by hand; they are not derived from the file name.
  #   la.scan_language_dna('landir/*')
  def scan_language_dna(pattern = '*', codepage = 'utf-8')
    Dir[pattern].each do |filename|
      dna = DNA.new
      dna.feed(filename, codepage)

      puts Languages.to_paste('<iso 639-3 code>', {
        'name'        => '<full language name>',
        'iso1'        => '<iso 639-1 code (optional)>',
        'source'      => filename,
        'size'        => dna.size,
        'utf8'        => dna.to_utf8,
        'fingerprint' => dna.to_s })
    end
  end

end
|
|
132
|
+
|
|
133
|
+
#la = LanguageAnalyzer.new
|
|
134
|
+
#p la.sources
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
=begin
|
|
2
|
+
|
|
3
|
+
Copyright © 2007 John Vorhauer
|
|
4
|
+
Contact me at langa@vorhauer.de near 50°55'N+6°55'E.
|
|
5
|
+
|
|
6
|
+
This file is part of Langa.
|
|
7
|
+
|
|
8
|
+
Langa is free software: you can redistribute it and/or modify
|
|
9
|
+
it under the terms of the GNU General Public License as published by
|
|
10
|
+
the Free Software Foundation, either version 3 of the License, or
|
|
11
|
+
(at your option) any later version.
|
|
12
|
+
|
|
13
|
+
Langa is distributed in the hope that it will be useful,
|
|
14
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
15
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
16
|
+
GNU General Public License for more details.
|
|
17
|
+
|
|
18
|
+
You should have received a copy of the GNU General Public License
|
|
19
|
+
along with Langa. If not, see <http://www.gnu.org/licenses/>.
|
|
20
|
+
|
|
21
|
+
For a detailed functional description of Langa see README file
|
|
22
|
+
|
|
23
|
+
=end
|
|
24
|
+
|
|
25
|
+
require 'rubygems'
|
|
26
|
+
require 'yaml'
|
|
27
|
+
require 'dna'
|
|
28
|
+
|
|
29
|
+
=begin rdoc
|
|
30
|
+
The class Languages handles attributes of different languages, particularly
|
|
31
|
+
the dna fingerprint used for language recognition.
|
|
32
|
+
|
|
33
|
+
The attributes for each language are stored in a YAML file of the form:
|
|
34
|
+
|
|
35
|
+
<three letter iso 639-3 language code>:
|
|
36
|
+
name: <name of the language>
|
|
37
|
+
iso1: <two letter iso 639-1 language code (optional)>
|
|
38
|
+
bibl: <three letter iso 639-2 bibliographic code (optional)>
|
|
39
|
+
source: <source file used for fingerprint creation>
|
|
40
|
+
size: <number of relevant characters for fingerprint creation>
|
|
41
|
+
utf8: <utf-8 representation of fingerprint>
|
|
42
|
+
fingerprint: <dna fingerprint of language>
|
|
43
|
+
|
|
44
|
+
i.e. this is shown for the german language
|
|
45
|
+
|
|
46
|
+
deu:
|
|
47
|
+
name: German
|
|
48
|
+
iso1: de
|
|
49
|
+
bibl: ger
|
|
50
|
+
source: corpora/ger.german.utf-8.txt
|
|
51
|
+
size: 92273185
|
|
52
|
+
utf8: enirtsadhlugcmobfkwzpvüäjöyxq
|
|
53
|
+
fingerprint: 101-16251+110-9918+105-7865+114-7637+116-6348...
|
|
54
|
+
|
|
55
|
+
For ISO 639-x codes see http://www.sil.org/ISO639-3/codes.asp
|
|
56
|
+
|
|
57
|
+
=end
|
|
58
|
+
|
|
59
|
+
# Holds the attributes of all configured languages, keyed by their ISO 639-3
# code, including the DNA fingerprint object built from each entry.
class Languages

  # Attributes that Languages.to_paste emits first, in this order.
  PASTE_ORDER = ['name', 'iso1', 'bibl', 'source', 'size', 'utf8', 'fingerprint'].freeze

  # Create the YAML representation of a language configuration for
  # manually pasting this to the language configuration file.
  #
  # The well-known attributes come first in a fixed order, followed by any
  # other entries of +config+. Values are aligned to the next multiple of
  # +indent+ columns. +config+ itself is not modified.
  #   Languages.to_paste('deu', {...}) -> "deu:\n    name:   German\n    ..."
  def Languages.to_paste(key, config, indent=4)
    margin  = ' ' * indent
    ordered = PASTE_ORDER.select { |name| config.has_key?(name) }
    others  = config.keys - ordered

    lines = (ordered + others).map do |name|
      label = "#{name}:"
      # An indent of 0 cannot be aligned; fall back to a single space so the
      # output stays valid YAML instead of dividing by zero.
      pad = indent > 0 ? ' ' * (indent - label.size % indent) : ' '
      "#{margin}#{label}#{pad}#{config[name]}\n"
    end

    "#{key}:\n" + lines.join
  end

  # Create a Languages object to query language attributes.
  #   la = Languages.new('language.dna')
  def initialize(config_file)
    @languages = load_language_configuration(config_file)
  end

  # Get the keys of all known languages. The keys are named according to
  # ISO 639-3.
  #   la.keys -> ['deu', 'eng', ...]
  def keys
    @languages.keys.sort
  end

  # Get the values of all languages for given attribute +name+, mapped to the
  # key of the language. Languages without that attribute are left out. With
  # +lcase+ the values are converted to lowercased strings. Returns +nil+ if
  # +name+ is +nil+.
  #   la.values_for('name')       -> {'German'=>'deu', 'English'=>'eng', ...}
  #   la.values_for('name', true) -> {'german'=>'deu', 'english'=>'eng', ...}
  def values_for(name, lcase=false)
    return nil if name.nil?

    result = {}
    @languages.each do |key, attributes|
      value = attributes[name]
      next if value.nil?
      # to_s keeps non-string values (e.g. the numeric 'size') from raising
      result[lcase ? value.to_s.downcase : value] = key
    end
    result
  end

  # Get the complete configuration for a specific language. You can use any
  # ISO 639 shortcut as a key (i.e. for german you can use 'deu', 'de' and
  # 'ger') or the name of the language in any letter case. Returns +nil+ for
  # an unknown key.
  #   la.config('deu') -> {"name"=>"German", "iso1"=>"de", "bibl"=>"ger", ...}
  def config(key)
    return @languages[key] if @languages.has_key?(key)

    # Translate the key: ISO 639-1 first, then the bibliographic code
    ['iso1', 'bibl'].each do |attribute|
      map = values_for(attribute)
      return @languages[map[key]] if map.has_key?(key)
    end

    # Finally try the language name, ignoring letter case
    code = values_for('name', true)[key.to_s.downcase]
    code.nil? ? nil : @languages[code]
  end


  private

  # Read the YAML configuration +filename+ and attach a DNA object, built
  # from the 'fingerprint' attribute, to every language under the key 'dna'.
  # Returns the resulting hash; an empty file yields an empty hash.
  def load_language_configuration(filename)
    # The block form closes the file handle again after loading
    config = File.open(filename) { |io| YAML.load(io) } || {}
    config.each_value do |lang|
      lang['dna'] = DNA.new(lang['fingerprint'])
    end
  end

end
|
|
145
|
+
|
|
146
|
+
#la = Languages.new
|
|
147
|
+
#p la.values_for('name', true)
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
=begin
|
|
2
|
+
|
|
3
|
+
Copyright © 2007 John Vorhauer
|
|
4
|
+
Contact me at langa@vorhauer.de near 50°55'N+6°55'E.
|
|
5
|
+
|
|
6
|
+
This file is part of Langa.
|
|
7
|
+
|
|
8
|
+
Langa is free software: you can redistribute it and/or modify
|
|
9
|
+
it under the terms of the GNU General Public License as published by
|
|
10
|
+
the Free Software Foundation, either version 3 of the License, or
|
|
11
|
+
(at your option) any later version.
|
|
12
|
+
|
|
13
|
+
Langa is distributed in the hope that it will be useful,
|
|
14
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
15
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
16
|
+
GNU General Public License for more details.
|
|
17
|
+
|
|
18
|
+
You should have received a copy of the GNU General Public License
|
|
19
|
+
along with Langa. If not, see <http://www.gnu.org/licenses/>.
|
|
20
|
+
|
|
21
|
+
For a detailed functional description of Langa see README file
|
|
22
|
+
|
|
23
|
+
=end
|
|
24
|
+
|
|
25
|
+
require 'rubygems'
|
|
26
|
+
|
|
27
|
+
require 'fileutils'
|
|
28
|
+
require 'languages'
|
|
29
|
+
|
|
30
|
+
=begin
|
|
31
|
+
|
|
32
|
+
The class RandomTestFiles creates various files in different languages for
|
|
33
|
+
testing purposes. Therefore it scans the language configuration for source
|
|
34
|
+
files and, if existing, extracts a portion from random position of it to an
|
|
35
|
+
extra file in a testing directory. The files are removed, when object life
|
|
36
|
+
cycle ends.
|
|
37
|
+
|
|
38
|
+
=end
|
|
39
|
+
|
|
40
|
+
# Creates small test files in different languages by copying random pieces
# out of the source files named in the language configuration. The pieces
# are written to a temporary directory below the package root, which is
# removed again when the iteration is finished.
class RandomTestFiles

  # Name of the temporary directory, relative to the package root
  RTF_DIR = 'rltf.temp'.freeze

  # Prepare the creation of +file_count+ test files of +chunk_size+ bytes
  # for every language whose source file exists. +language_file+ names the
  # language configuration to scan. If a block is given, #each is run with
  # it right away.
  #   RandomTestFiles.new(5, 100) { |lang, files| ... }
  def initialize(file_count, chunk_size, language_file = 'language.dna', &block)
    # Store test parameter
    @count = file_count
    @size  = chunk_size

    # remember root path
    @root    = File.join(File.dirname(__FILE__), '..', '..')
    @rtf_dir = File.join(@root, RTF_DIR)

    # keep only the languages whose source file exists; the source stays
    # relative to the root, as copy_random_pieces expects it
    @sources = []
    Languages.new(language_file).values_for('source').each do |source, lang|
      @sources << [lang, source] if File.exist?(File.join(@root, source))
    end

    # => initialize random seed for different results
    srand((Time.new.to_f * 10000).to_i)

    each(&block) if block_given?

    self
  end

  alias :create :initialize

  # Yield the language key and the list of freshly written test files for
  # every available source. The temporary directory is removed afterwards,
  # even if the block raises. Returns +self+.
  def each
    # => create temp directory
    FileUtils.mkdir_p(@rtf_dir)

    begin
      @sources.each do |lang, source|
        yield lang, copy_random_pieces(source, @count, @size)
      end
    ensure
      clear
    end

    self
  end

  # Remove the temporary directory with all test files. Does nothing if the
  # directory does not exist.
  def clear
    FileUtils.rm_rf(@rtf_dir)
  end

  # Copy +file_count+ random pieces of +chunk_size+ bytes from +filename+
  # (relative to the package root) to test files in the temporary directory.
  # Bytes of UTF-8 sequences cut apart at the borders of a piece are
  # dropped, so a written file may be a few bytes shorter than +chunk_size+.
  #
  # Returns the names of the written files, or +nil+ if the source file is
  # missing or smaller than +chunk_size+.
  def copy_random_pieces(filename, file_count, chunk_size)
    source_file = File.join(@root, filename)
    return nil unless File.file?(source_file)

    # => remember size of source, so that we don't grab past EOF
    file_size = File.size(source_file)
    return nil if chunk_size > file_size

    FileUtils.mkdir_p(@rtf_dir)
    base  = File.basename(source_file).sub(/\.txt\z/i, '')
    files = []

    # => binary mode, because the pieces are cut at arbitrary byte offsets
    File.open(source_file, 'rb') do |file|

      # => create number of desired output files
      (1..file_count).each do |i|

        # => grab a random piece of fixed size from the source; the +1 makes
        #    the last possible offset reachable and keeps rand from being
        #    called with 0 (which would return a Float)
        file.seek(rand(file_size - chunk_size + 1), IO::SEEK_SET)
        piece = trim_partial_utf8(file.read(chunk_size))

        # => write piece to test file
        files << File.join(@rtf_dir, "#{base}.#{chunk_size}.#{i}.txt")
        File.open(files.last, 'wb') { |fout| fout.write(piece) }
      end
    end
    files
  end

  private

  # Drop potentially broken UTF-8 sequences at both ends of +piece+: leading
  # continuation bytes, and a trailing multi-byte character (its
  # continuation bytes and lead byte). Works on byte values, so the result
  # does not depend on the encoding of the string.
  def trim_partial_utf8(piece)
    bytes = piece.to_s.unpack('C*')
    bytes.shift while utf8_continuation?(bytes.first)
    bytes.pop while utf8_continuation?(bytes.last)
    bytes.pop if utf8_lead?(bytes.last)
    bytes.pack('C*')
  end

  # True if +byte+ is a continuation byte of a UTF-8 sequence (10xxxxxx).
  def utf8_continuation?(byte)
    !byte.nil? && byte >= 0x80 && byte <= 0xbf
  end

  # True if +byte+ is in the range treated as the start of a multi-byte
  # sequence (0xc0..0xff).
  def utf8_lead?(byte)
    !byte.nil? && byte >= 0xc0 && byte <= 0xff
  end

end
|
|
137
|
+
|
|
138
|
+
#rlf = RandomTestFiles.new(5,100).each do |a,b|
|
|
139
|
+
# p [a, b]
|
|
140
|
+
#end
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
=begin
|
|
2
|
+
|
|
3
|
+
Copyright © 2007 John Vorhauer
|
|
4
|
+
Contact me at langa@vorhauer.de near 50°55'N+6°55'E.
|
|
5
|
+
|
|
6
|
+
This file is part of Langa.
|
|
7
|
+
|
|
8
|
+
Langa is free software: you can redistribute it and/or modify
|
|
9
|
+
it under the terms of the GNU General Public License as published by
|
|
10
|
+
the Free Software Foundation, either version 3 of the License, or
|
|
11
|
+
(at your option) any later version.
|
|
12
|
+
|
|
13
|
+
Langa is distributed in the hope that it will be useful,
|
|
14
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
15
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
16
|
+
GNU General Public License for more details.
|
|
17
|
+
|
|
18
|
+
You should have received a copy of the GNU General Public License
|
|
19
|
+
along with Langa. If not, see <http://www.gnu.org/licenses/>.
|
|
20
|
+
|
|
21
|
+
For a detailed functional description of Langa see README file
|
|
22
|
+
|
|
23
|
+
=end
|
|
24
|
+
|
|
25
|
+
=begin rdoc
|
|
26
|
+
|
|
27
|
+
Extend class Array and class String to uniformly handle code conversion
|
|
28
|
+
to and from unicode
|
|
29
|
+
|
|
30
|
+
=end
|
|
31
|
+
|
|
32
|
+
class String
|
|
33
|
+
|
|
34
|
+
# Converts a string to an array of unicode values (one Integer code point
# per character), decoding the string as UTF-8 via the 'U*' unpack directive.
|
|
35
|
+
# 'äöü'.to_unicode => [228, 246, 252]
|
|
36
|
+
# 'äöü' as UTF-8 is equivalent to the bytes [195, 164, 195, 182, 195, 188]
|
|
37
|
+
def to_unicode
|
|
38
|
+
self.unpack('U*')
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
class Array
|
|
44
|
+
|
|
45
|
+
# Converts an array of unicode values (Integer code points) to a UTF-8 coded
# string via the 'U*' pack directive; the counterpart of String#to_unicode.
|
|
46
|
+
# [228, 246, 252].to_utf8 -> 'äöü'
|
|
47
|
+
|
|
48
|
+
def to_utf8
|
|
49
|
+
self.pack('U*')
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
end
|
|
53
|
+
|