langa 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING +674 -0
- data/README +69 -0
- data/bin/langa +169 -0
- data/examples/afrikaans_1953_utf8.txt +1000 -0
- data/examples/albanian_utf8.txt +1000 -0
- data/examples/amharic_utf8.txt +1000 -0
- data/examples/arabic_svd_utf8.txt +1000 -0
- data/examples/armenian_western_1853_utf8.txt +1000 -0
- data/examples/asv_utf8.txt +1000 -0
- data/examples/basque_1571_utf8.txt +1000 -0
- data/examples/breton_utf8.txt +1000 -0
- data/examples/chinese_ncv_s_utf8.txt +1000 -0
- data/examples/chinese_ncv_utf8.txt +1000 -0
- data/examples/chinese_union_s_utf8.txt +1000 -0
- data/examples/chinese_union_utf8.txt +1000 -0
- data/examples/coptic_nt_utf8.txt +1000 -0
- data/examples/croatian_utf8.txt +1000 -0
- data/examples/czech_bkr_utf8.txt +1000 -0
- data/examples/danish_utf8.txt +1000 -0
- data/examples/dutch_svv_utf8.txt +1000 -0
- data/examples/esperanto_utf8.txt +1000 -0
- data/examples/estonian_utf8.txt +1000 -0
- data/examples/finnish_pr_1992_utf8.txt +1000 -0
- data/examples/french_ostervald_1996_utf8.txt +1000 -0
- data/examples/german_schlachter_1951_utf8.txt +1000 -0
- data/examples/greek_byzantine_2000_utf8.txt +1000 -0
- data/examples/greek_modern_utf8.txt +1000 -0
- data/examples/hebrew_modern_utf8.txt +1000 -0
- data/examples/hungarian_karoli_utf8.txt +1000 -0
- data/examples/italian_riveduta_1927_utf8.txt +1000 -0
- data/examples/kabyle_nt_utf8.txt +1000 -0
- data/examples/kjv_apocrypha_utf8.txt +1000 -0
- data/examples/korean_utf8.txt +1000 -0
- data/examples/latin_vulgata_clementina_utf8.txt +1000 -0
- data/examples/latvian_nt_utf8.txt +1000 -0
- data/examples/lithuanian_utf8.txt +1000 -0
- data/examples/manx_gaelic_utf8.txt +1000 -0
- data/examples/maori_utf8.txt +1000 -0
- data/examples/myanmar_judson_1835_utf8.txt +1000 -0
- data/examples/norwegian_utf8.txt +1000 -0
- data/examples/peshitta_utf8.txt +1000 -0
- data/examples/portuguese_utf8.txt +1000 -0
- data/examples/romani_utf8.txt +1000 -0
- data/examples/romanian_cornilescu_utf8.txt +1000 -0
- data/examples/russian_makarij_utf8.txt +1000 -0
- data/examples/spanish_reina_valera_1909_utf8.txt +1000 -0
- data/examples/swedish_1917_utf8.txt +1000 -0
- data/examples/tagalog_1905_utf8.txt +1000 -0
- data/examples/thai_kjv_utf8.txt +1000 -0
- data/examples/turkish_nt_utf8.txt +1000 -0
- data/examples/turkish_utf8.txt +1000 -0
- data/examples/ukrainian_1871_utf8.txt +1000 -0
- data/examples/vietnamese_1934_utf8.txt +1000 -0
- data/examples/wolof_utf8.txt +1000 -0
- data/examples/xhosa_utf8.txt +1000 -0
- data/lib/langa.rb +35 -0
- data/lib/langa/dna.rb +209 -0
- data/lib/langa/file.rb +97 -0
- data/lib/langa/langa.dna +406 -0
- data/lib/langa/languageanalyzer.rb +134 -0
- data/lib/langa/languages.rb +147 -0
- data/lib/langa/randomtestfiles.rb +140 -0
- data/lib/langa/utilities.rb +53 -0
- data/test/tc_file.rb +47 -0
- data/test/tc_languages.rb +69 -0
- data/test/tc_utilities.rb +42 -0
- data/unicode/CaseFolding.txt +1065 -0
- data/unicode/CaseFolding.txt.webloc +8 -0
- data/unicode/Index of -Public-MAPPINGS.webloc b/data/unicode/Index of → -Public-MAPPINGS.webloc +0 -0
- data/unicode/mappings/8859-1.TXT +303 -0
- data/unicode/mappings/8859-10.TXT +303 -0
- data/unicode/mappings/8859-11.TXT +297 -0
- data/unicode/mappings/8859-13.TXT +299 -0
- data/unicode/mappings/8859-14.TXT +301 -0
- data/unicode/mappings/8859-15.TXT +303 -0
- data/unicode/mappings/8859-16.TXT +299 -0
- data/unicode/mappings/8859-2.TXT +303 -0
- data/unicode/mappings/8859-3.TXT +296 -0
- data/unicode/mappings/8859-4.TXT +303 -0
- data/unicode/mappings/8859-5.TXT +303 -0
- data/unicode/mappings/8859-6.TXT +260 -0
- data/unicode/mappings/8859-7.TXT +308 -0
- data/unicode/mappings/8859-8.TXT +270 -0
- data/unicode/mappings/8859-9.TXT +307 -0
- data/unicode/mappings/ATARIST.TXT +313 -0
- data/unicode/mappings/CP037.TXT +275 -0
- data/unicode/mappings/CP1006.TXT +302 -0
- data/unicode/mappings/CP1026.TXT +275 -0
- data/unicode/mappings/CP1250.TXT +274 -0
- data/unicode/mappings/CP1251.TXT +274 -0
- data/unicode/mappings/CP1252.TXT +274 -0
- data/unicode/mappings/CP1253.TXT +274 -0
- data/unicode/mappings/CP1254.TXT +274 -0
- data/unicode/mappings/CP1255.TXT +274 -0
- data/unicode/mappings/CP1256.TXT +274 -0
- data/unicode/mappings/CP1257.TXT +274 -0
- data/unicode/mappings/CP1258.TXT +274 -0
- data/unicode/mappings/CP424.TXT +304 -0
- data/unicode/mappings/CP437.TXT +274 -0
- data/unicode/mappings/CP500.TXT +275 -0
- data/unicode/mappings/CP737.TXT +274 -0
- data/unicode/mappings/CP775.TXT +275 -0
- data/unicode/mappings/CP850.TXT +274 -0
- data/unicode/mappings/CP852.TXT +274 -0
- data/unicode/mappings/CP855.TXT +275 -0
- data/unicode/mappings/CP856.TXT +303 -0
- data/unicode/mappings/CP857.TXT +275 -0
- data/unicode/mappings/CP860.TXT +275 -0
- data/unicode/mappings/CP861.TXT +275 -0
- data/unicode/mappings/CP862.TXT +275 -0
- data/unicode/mappings/CP863.TXT +275 -0
- data/unicode/mappings/CP864.TXT +275 -0
- data/unicode/mappings/CP865.TXT +275 -0
- data/unicode/mappings/CP866.TXT +275 -0
- data/unicode/mappings/CP869.TXT +275 -0
- data/unicode/mappings/CP874.TXT +274 -0
- data/unicode/mappings/CP875.TXT +275 -0
- data/unicode/mappings/KOI8-R.TXT +302 -0
- data/unicode/mappings/NEXTSTEP.TXT +173 -0
- data/unicode/mappings/ROMAN.TXT +275 -0
- data/unicode/mappings/US-ASCII-QUOTES.TXT +198 -0
- metadata +180 -0
data/lib/langa.rb
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
=begin
|
|
2
|
+
|
|
3
|
+
Copyright © 2007 John Vorhauer
|
|
4
|
+
Contact me at langa@vorhauer.de near 50°55'N+6°55'E.
|
|
5
|
+
|
|
6
|
+
This file is part of Langa.
|
|
7
|
+
|
|
8
|
+
Langa is free software: you can redistribute it and/or modify
|
|
9
|
+
it under the terms of the GNU General Public License as published by
|
|
10
|
+
the Free Software Foundation, either version 3 of the License, or
|
|
11
|
+
(at your option) any later version.
|
|
12
|
+
|
|
13
|
+
Langa is distributed in the hope that it will be useful,
|
|
14
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
15
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
16
|
+
GNU General Public License for more details.
|
|
17
|
+
|
|
18
|
+
You should have received a copy of the GNU General Public License
|
|
19
|
+
along with Langa. If not, see <http://www.gnu.org/licenses/>.
|
|
20
|
+
|
|
21
|
+
For a detailed functional description of Langa see README file
|
|
22
|
+
|
|
23
|
+
=end
|
|
24
|
+
|
|
25
|
+
$:.unshift File.join(File.dirname(__FILE__), 'langa')
|
|
26
|
+
|
|
27
|
+
require 'rubygems'
|
|
28
|
+
require 'utilities'
|
|
29
|
+
require 'file'
|
|
30
|
+
require 'dna'
|
|
31
|
+
require 'languages'
|
|
32
|
+
require 'languageanalyzer'
|
|
33
|
+
require 'randomtestfiles'
|
|
34
|
+
require 'getoptlong'
|
|
35
|
+
|
data/lib/langa/dna.rb
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
=begin
|
|
2
|
+
|
|
3
|
+
Copyright © 2007 John Vorhauer
|
|
4
|
+
Contact me at langa@vorhauer.de near 50°55'N+6°55'E.
|
|
5
|
+
|
|
6
|
+
This file is part of Langa.
|
|
7
|
+
|
|
8
|
+
Langa is free software: you can redistribute it and/or modify
|
|
9
|
+
it under the terms of the GNU General Public License as published by
|
|
10
|
+
the Free Software Foundation, either version 3 of the License, or
|
|
11
|
+
(at your option) any later version.
|
|
12
|
+
|
|
13
|
+
Langa is distributed in the hope that it will be useful,
|
|
14
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
15
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
16
|
+
GNU General Public License for more details.
|
|
17
|
+
|
|
18
|
+
You should have received a copy of the GNU General Public License
|
|
19
|
+
along with Langa. If not, see <http://www.gnu.org/licenses/>.
|
|
20
|
+
|
|
21
|
+
For a detailed functional description of Langa see README file
|
|
22
|
+
|
|
23
|
+
=end
|
|
24
|
+
|
|
25
|
+
=begin
|
|
26
|
+
|
|
27
|
+
The class DNA creates a typical fingerprint from a unicode character stream.
|
|
28
|
+
This fingerprint can be compared with fingerprints of other test streams to
|
|
29
|
+
support an automatic language recognition.
|
|
30
|
+
|
|
31
|
+
The fingerprint is a statistical analysis of the frequency of occurrence of
|
|
32
|
+
single characters. During the analysis, non-letter characters are filtered out and
|
|
33
|
+
upper case letters are mapped to lowercase.
|
|
34
|
+
|
|
35
|
+
The distance between two fingerprints is measured in the sum of distances
|
|
36
|
+
between each single letter.
|
|
37
|
+
|
|
38
|
+
=end
|
|
39
|
+
|
|
40
|
+
class DNA

  # Shared lookup table (code point -> case-folded code point). It is filled
  # lazily by DNA.fill_gene_map the first time a DNA object is created.
  @@gene_map = Hash.new

  # The gene map has two main purposes for all characters in the range
  # of U+0000 to U+024F:
  #
  # 1. Map a unicode character from uppercase to lowercase
  # 2. Filter relevant characters from punctuation and spacing characters
  #
  # For the first purpose the gene map relies on the CaseFolding.txt file
  # from unicode.org. For the second purpose some relevant characters are
  # mapped by hand, because they are not differentiated between
  # upper/lowercase. Grab latest version from
  # http://www.unicode.org/Public/UNIDATA/CaseFolding.txt

  def DNA.fill_gene_map
    # => find local CaseFolding.txt
    case_fold = File.join(File.dirname(__FILE__), '..', '..', 'unicode', 'CaseFolding.txt')

    # => load upper-/lowercase mappings (block form closes the file handle)
    File.open(case_fold) do |file|
      file.each_line do |line|
        # Line format looks like
        # 0041; C; 0061; # LATIN CAPITAL LETTER A
        code, stat, mapp = line.gsub(/ /, '').split(';')
        if stat=='C' || stat=='S'
          code, mapp = code.hex, mapp.hex
          # both the source and the folded code point map to the folded one
          @@gene_map[code] = @@gene_map[mapp] = mapp
        end
      end
    end

    # complete mapping for use as legal character identification
    [0x130, 0x131, 0x138, 0x149, 0x180, 0x18d, 0x19b, 0x1aa, 0x1ab, 0x1ba,
     0x1bb, 0x1be, 0x1f0, 0x221, 0x234, 0x235, 0x236, 0x237, 0x238, 0x239,
     0x23a, 0x23e, 0x23f, 0x240].each { |code| @@gene_map[code] = code }

  end


  # Create a new DNA object. You can create a DNA object from an existing
  # fingerprint.
  #   d = DNA.new
  #   d = DNA.new(fingerprint)
  #
  # Raises ArgumentError if more than one argument is given or if the single
  # argument is not a String.
  def initialize(*parm)
    # => initialize class variable
    @@gene_map.empty? && DNA.fill_gene_map

    # => per-instance state: dna_chain holds the dna while growing,
    #    fingerprint is the final identifier of the dna. These must be set
    #    here; assignments in the class body would not reach the instances.
    @dna_chain   = nil
    @dna_size    = 0
    @fingerprint = nil

    # => check parameters
    case parm.size
    when 0
      @dna_chain = Hash.new(0)
    when 1
      if parm[0].is_a?(String)
        # => create dna object from fingerprint ("code-weight+code-weight+...")
        @fingerprint = Hash.new
        parm[0].scan(/([^+-]+)-([^+-]+)/).each do |gene|
          # Two explicit steps: the result must not depend on the evaluation
          # order of a multiple assignment whose target index is assigned in
          # the same statement.
          idx, weight = gene.collect {|var| var.to_i}
          @fingerprint[idx] = weight
        end
      else
        raise ArgumentError, "wrong type of argument (String expected)"
      end
    else
      raise ArgumentError, "wrong number of argument (#{parm.size} for 0/1)"
    end

  end

  # Add an unicode character to the dna chain. This has to be done before
  # calculating the dna fingerprint. If the fingerprint was already
  # calculated, you have to reset the dna object, before you can add another
  # character.
  #   add_gene(0x123)
  #
  # Raises RuntimeError if a fingerprint is already present.

  def add_gene(unicode)
    raise "fingerprint already calculated, try reset first" unless @fingerprint.nil?
    if unicode > 0x0250
      # NOTE(review): the original line carried the guard
      #   unless unicode === (0x2b0..0x2af)
      # which compares an Integer with a Range (always false) and uses an
      # empty range, so it never filtered anything. Behaviour is preserved
      # here; a filter for 0x2b0..0x2ff may have been intended -- confirm.
      # NOTE(review): U+0250 itself takes the gene-map branch although the
      # documented gene-map range ends at U+024F -- confirm the boundary.
      @dna_chain[unicode] += 1
    else
      # code points missing from the gene map are dropped from the chain
      @dna_chain[@@gene_map[unicode]] += 1 if @@gene_map.has_key?(unicode)
    end
    # counts every character seen, including the filtered ones
    @dna_size += 1
  end

  # With feed you can give complete files as an input to the dna. You
  # must specify a codepage for input character conversion before the
  # dna calculations (preferably UTF-8). For codepage namings see class
  # File.
  #   dna = DNA.new
  #   dna.feed('input-text', '8859-1')

  def feed(filename, codepage)
    reset
    # block form so the input file is closed once it has been consumed
    File.open([filename, codepage]) do |file|
      file.each_unicode {|uc| add_gene(uc) }
    end
  end

  # The fingerprint is the significant extract of a file, which is essential
  # for the language recognition process. It is computed once from the dna
  # chain as an Array of [code point, weight] pairs sorted by descending
  # weight; an object created from a fingerprint string returns its Hash.

  def fingerprint
    if @fingerprint.nil?
      # => filter genes, that are least significant
      filter = (@dna_chain.size > 1000) ? 100 : 10

      # => check the length of the chain, i.e. number of counted characters
      length = 0
      @dna_chain.each { |pair| length += pair[1] }

      # => normalize the frequency of characters (parts per 100000)
      @fingerprint = @dna_chain.collect { |gene| char, freq = gene
        weight = (freq * 100000.0 / length).to_i
        (weight > filter) ? [char, weight] : nil
      }.compact.sort {|a,b| b[1]<=>a[1]}
    end
    @fingerprint
  end

  # Calculate the distance between two fingerprints to measure the equality.
  # Only the genes of the receiver are summed up: a gene missing in the other
  # fingerprint contributes its full weight.
  #   dna.distance(other_dna)   -> distance
  def distance(dna)
    # The other fingerprint may be a Hash (created from a string) or an Array
    # of pairs (calculated from a chain); index it uniformly.
    other = {}
    dna.fingerprint.each { |char, freq| other[char] = freq }

    dst = 0
    # use the accessor so a not yet calculated fingerprint is built on demand
    fingerprint.each do |char, freq|
      dst += (other.has_key?(char) ? (other[char]-freq).abs : freq)
    end
    dst / 1000.0
  end

  # Reset the DNA object. Also usable on an object that was created from a
  # fingerprint string and therefore has no dna chain yet.
  def reset
    @dna_chain = Hash.new(0)
    @dna_size = 0
    @fingerprint = nil
  end

  # Number of characters passed to add_gene since creation or the last reset.
  def size
    @dna_size
  end

  # Convert the fingerprint to an UTF-8 string.
  #   dna.to_utf8   -> 'enirtsadhlugcmobfkwzpvüäjöyxq'
  # Relies on Array#to_utf8, which is not defined in this file (presumably
  # provided by utilities.rb).
  def to_utf8
    fingerprint.collect {|pair| pair[0]}.to_utf8
  end

  # Convert the fingerprint to a string.
  #   dna.to_s      -> '101-16251+110-9918+105-7865+...'
  def to_s
    fingerprint.collect { |gene| gene.join('-') }.join('+')
  end

#  def distance_orig(reference)
#    ref = reference.dup
#    @fingerprint.each do |gene| char, count = gene
#      ref.has_key?(char) && ref[char]=(ref[char]>count)?(ref[char]-count):0
#    end
#    dst = 0
#    ref.each {|k,v| dst += v if k.is_a?(Numeric)}
#    dst / 1000.0
#  end

end
|
data/lib/langa/file.rb
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
=begin
|
|
2
|
+
|
|
3
|
+
Copyright © 2007 John Vorhauer
|
|
4
|
+
Contact me at langa@vorhauer.de near 50°55'N+6°55'E.
|
|
5
|
+
|
|
6
|
+
This file is part of Langa.
|
|
7
|
+
|
|
8
|
+
Langa is free software: you can redistribute it and/or modify
|
|
9
|
+
it under the terms of the GNU General Public License as published by
|
|
10
|
+
the Free Software Foundation, either version 3 of the License, or
|
|
11
|
+
(at your option) any later version.
|
|
12
|
+
|
|
13
|
+
Langa is distributed in the hope that it will be useful,
|
|
14
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
15
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
16
|
+
GNU General Public License for more details.
|
|
17
|
+
|
|
18
|
+
You should have received a copy of the GNU General Public License
|
|
19
|
+
along with Langa. If not, see <http://www.gnu.org/licenses/>.
|
|
20
|
+
|
|
21
|
+
For a detailed functional description of Langa see README file
|
|
22
|
+
|
|
23
|
+
=end
|
|
24
|
+
|
|
25
|
+
=begin rdoc
|
|
26
|
+
|
|
27
|
+
Extend class File to convert characters from different codepages into an
|
|
28
|
+
unicode stream.
|
|
29
|
+
|
|
30
|
+
You can specify any codepage, that is listed in the directory
|
|
31
|
+
#{LANGA}/unicode/mappings (omit the '.txt' extension).
|
|
32
|
+
Grab the latest versions from http://www.unicode.org/Public/MAPPINGS
|
|
33
|
+
|
|
34
|
+
=end
|
|
35
|
+
|
|
36
|
+
class File
  alias old_initialize initialize

  # Specify a codepage with the new or open method. If no codepage is
  # specified, UTF-8 will be assumed.
  #   file = File.open([filename, codepage], ...)
  #
  # Raises RuntimeError if a codepage was given together with a string mode
  # containing 'w' or 'a'. Plain opens without a codepage are passed through
  # unchanged, so ordinary file writing keeps working while this extension
  # is loaded.
  def initialize(*par)
    self.codepage = 'utf-8'
    with_codepage = par[0].is_a?(Array)
    if with_codepage
      # par[0] is [filename, codepage]
      cp, name = par[0].reverse
      self.codepage = cp
      par[0] = name
    end
    if with_codepage && par[1].is_a?(String) && par[1] =~ /[wa]/
      raise "mode not supported with codepage conversion"
    end
    old_initialize(*par)
  end
  # Forward keyword arguments (e.g. encoding: ...) through *par on Ruby
  # versions that separate keywords from positional arguments.
  ruby2_keywords(:initialize) if respond_to?(:ruby2_keywords, true)

  # Specify a codepage after open and before processing the file.
  #   file.codepage = '8859-4'
  # For every codepage other than UTF-8 the byte -> unicode table is loaded
  # from the mapping file of that name; bytes without a mapping stay 0.
  def codepage=(cp)
    @codepage = cp
    unless utf8?
      @code_map = Array.new(256, 0)
      # block form closes the mapping file after it has been read
      File.open(langa_mapping_file(cp)) do |mapping|
        mapping.each_line do |line|
          line.downcase =~ %r|^([0-9a-fx]+)\s([0-9a-fx]+)| &&
            @code_map[$1.hex] = $2.hex
        end
      end
    end
  end

  # Walk through a file step by step for each unicode character.
  #   file.each_unicode { |unicode| ... }
  def each_unicode
    # line by line instead of readlines, so the file is not held in memory
    each_line do |line|
      transcode(line).each do |char|
        yield char
      end
    end
  end

  # True, if codepage of file is UTF-8 (spelling is not case-sensitive)
  #   file.utf8?    -> true
  def utf8?
    %w[utf-8 utf8].include?(@codepage.to_s.downcase)
  end


  private

  # Path of the mapping file for codepage +cp+. The name is matched without
  # regard to case, because the mapping files are shipped as '*.TXT'. If no
  # entry matches, the literal "#{cp}.txt" path is returned so that opening
  # it fails with the usual error.
  def langa_mapping_file(cp)
    maps = File.join(File.dirname(__FILE__), '..', '..', 'unicode', 'mappings')
    wanted = "#{cp}.txt".downcase
    entry = Dir.entries(maps).find { |name| name.downcase == wanted }
    File.join(maps, entry || "#{cp}.txt")
  end

  # Convert a string from codepage coding to unicode
  def transcode(str)
    unicode = nil
    if utf8?
      # String#to_unicode is not defined in this file (presumably provided
      # by utilities.rb)
      unicode = str.to_unicode
    else
      unicode = []
      str.each_byte {|byte| unicode << @code_map[byte] }
    end
    unicode
  end

end
|
|
97
|
+
|
data/lib/langa/langa.dna
ADDED
|
@@ -0,0 +1,406 @@
|
|
|
1
|
+
---
|
|
2
|
+
afk:
|
|
3
|
+
name: Afrikaans
|
|
4
|
+
iso1: af
|
|
5
|
+
source: bible/afrikaans_1953/afrikaans_1953_utf8.txt
|
|
6
|
+
size: 3232617
|
|
7
|
+
utf8: eaniodrstlghmuvkwybpjfêëéíáócäú
|
|
8
|
+
fingerprint: 101-17648+97-8725+110-8531+105-7167+111-6711+100-6026+114-5526+115-5517+116-5095+108-4392+103-3582+104-3129+109-2643+117-2532+118-2440+107-2391+119-2011+121-1553+98-1161+112-971+106-875+102-608+234-249+235-149+233-91+237-70+225-70+243-37+99-27+228-27+250-12
|
|
9
|
+
|
|
10
|
+
sqi:
|
|
11
|
+
name: Albanian
|
|
12
|
+
iso1: sq
|
|
13
|
+
bibl: alb
|
|
14
|
+
source: bible/albanian/albanian_utf8.txt
|
|
15
|
+
size: 3160948
|
|
16
|
+
utf8: ëteiarnhsodjumkplbgvqzfyçcx
|
|
17
|
+
fingerprint: 235-10285+116-9353+101-8926+105-7899+97-6944+114-6647+110-6168+104-4860+115-4544+111-4290+100-4128+106-3965+117-3677+109-3102+107-2718+112-2629+108-2115+98-1576+103-1378+118-1102+113-984+122-903+102-734+121-662+231-198+99-165+120-30
|
|
18
|
+
|
|
19
|
+
amh:
|
|
20
|
+
name: Amharic
|
|
21
|
+
iso1: am
|
|
22
|
+
source: bible/amharic/amharic_utf8.txt
|
|
23
|
+
size: 421502
|
|
24
|
+
utf8: ንምእአበርትውስለልየ።ይነnመደናተችሁያከግድወብላ፥ሰገህታሉሱራማረ፤ቀባሆዚቸክሚጠዘቅፈዳሳጋዲጥፍሞቶጣሩቱቃዋሕሥሔኢካሙኛዱሎኝዛዎጌኑዝሌሊኔጊኋዜሮኖዓቢፋዙጅቡቆቁሲፊጡጽጸኃሪጉሐቤቻዕሣሃሄጀዩሴዶጢዮጻሬቹጂፉሶጆሽጎዴጦጭኳሠጨኞሸፎቴሻኤዊኩሜኪጳቂዬቲጴኵኅቼፅጁዞጮኘጫቦሦዖኙኸቈቍጐቄኮጃጹሾኒፃሡፀሂሹጲጄቋፁፌሺጓጕጶጤኬ
|
|
25
|
+
fingerprint: 4757-6355+4637-4061+4773-3399+4768-3041+4704-2965+4653-2895+4725-2761+4813-2712+4661-2659+4616-2598+4621-2255+4840-2158+4962-2007+4845-1999+4752-1919+110-1819+4632-1798+4848-1770+4755-1716+4720-1658+4733-1614+4609-1577+4843-1421+4776-1417+4877-1391+4853-1359+4808-1355+4709-1352+4619-1309+4965-1242+4656-1154+4872-1143+4613-1020+4723-908+4617-804+4657-803+4651-787+4635-761+4648-732+4964-731+4672-729+4707-712+4614-692+4826-663+4728-615+4781-594+4634-555+4896-548+4824-520+4677-519+4936-518+4851-505+4659-483+4875-467+4850-443+4901-442+4941-432+4638-417+4726-414+4899-398+4649-350+4721-350+4675-323+4811-317+4629-315+4645-307+4628-293+4770-285+4779-278+4633-261+4763-245+4849-241+4622-241+4765-237+4827-226+4814-226+4876-220+4753-219+4829-217+4620-217+4618-216+4756-216+4874-214+4747-212+4828-210+4654-208+4758-204+4819-190+4706-187+4939-180+4825-173+4869-167+4705-166+4678-156+4673-153+4658-148+4938-147+4897-141+4925-139+4920-137+4739-130+4650-125+4873-124+4624-121+4708-118+4731-116+4821-116+4643-108+4611-106+4612-106+4864-105+4841-98+4660-98+4854-97+4898-95+4846-94+4923-93+4652-93+4729-87+4866-86+4937-84+4662-83+4870-83+4669-82+4878-77+4852-75+4902-74+4909-71+4787-71+4640-71+4904-69+4766-63+4664-62+4942-61+4724-60+4667-60+4772-60+4810-58+4777-53+4636-53+4778-47+4915-47+4674-44+4844-42+4722-42+4916-42+4789-42+4741-41+4732-39+4933-39+4865-39+4830-35+4910-35+4760-33+4907-33+4710-32+4646-31+4822-29+4761-29+4792-28+4680-27+4685-27+4880-27+4676-27+4782-26+4867-25+4921-25+4670-25+4754-24+4931-22+4641-21+4928-20+4610-19+4665-15+4914-15+4868-15+4683-14+4929-14+4940-13+4666-13+4883-12+4885-12+4918-11+4900-11+4780-11
|
|
26
|
+
|
|
27
|
+
ara:
|
|
28
|
+
name: Arabic
|
|
29
|
+
iso1: ar
|
|
30
|
+
source: bible/arabic_svd/arabic_svd_utf8.txt
|
|
31
|
+
size: 1913503
|
|
32
|
+
utf8: اليومنربهعتكفسدقحجةoذشخىصطضئثnّزأءغآظؤ
|
|
33
|
+
fingerprint: 1575-16041+1604-10781+1610-7450+1608-6617+1605-6376+1606-5991+1585-4336+1576-4169+1607-3668+1593-3248+1578-3233+1603-3135+1601-2570+1587-2422+1583-2315+1602-1971+1581-1551+1580-1375+1577-1346+111-1210+1584-1180+1588-947+1582-918+1609-914+1589-730+1591-686+1590-639+1574-561+1579-532+110-416+1617-397+1586-390+1571-388+1569-355+8206-284+1594-282+1570-232+1592-199+1572-102
|
|
34
|
+
|
|
35
|
+
arc:
|
|
36
|
+
name: Aramaic
|
|
37
|
+
source: bible/peshitta/peshitta_utf8.txt
|
|
38
|
+
size: 477091
|
|
39
|
+
utf8: ܐܘܢܝܠܕܡܗܬܪܒܟܫܥܚ܀nܩܦܤܓܛܙܨ
|
|
40
|
+
fingerprint: 1808-13524+1816-9865+1826-9307+1821-8743+1824-7178+1813-6238+1825-6166+1815-5318+1836-5069+1834-4171+1810-4138+1823-2870+1835-2697+1829-2512+1818-2194+1792-1670+110-1670+1833-1408+1830-1395+1828-1242+1811-864+1819-792+1817-605+1832-315
|
|
41
|
+
|
|
42
|
+
hye:
|
|
43
|
+
name: Armenian
|
|
44
|
+
iso1: hy
|
|
45
|
+
bibl: arm
|
|
46
|
+
source: bible/armenian_western_1853/armenian_western_1853_utf8.txt
|
|
47
|
+
size: 692147
|
|
48
|
+
utf8: անորեիւսմտէկըnյքցլհպբթվդզղչծգխջռձշօփճժ
|
|
49
|
+
fingerprint: 1377-12429+1398-8854+1400-8031+1408-7801+1381-7001+1387-5885+1410-5729+1405-3496+1396-3272+1407-3135+1383-3093+1391-2705+1384-2505+110-2303+1397-2089+1412-2078+1409-1706+1388-1624+1392-1595+1402-1336+1378-1210+1385-1193+1406-1192+1380-1170+1382-979+1394-950+1401-949+1390-948+1379-870+1389-563+1403-558+1404-552+1393-538+1399-538+1413-458+1411-264+1395-198+1386-155
|
|
50
|
+
|
|
51
|
+
eus:
|
|
52
|
+
name: Basque
|
|
53
|
+
iso1: eu
|
|
54
|
+
bibl: baq
|
|
55
|
+
source: bible/basque_1571/basque_1571_utf8.txt
|
|
56
|
+
size: 844751
|
|
57
|
+
utf8: aenitrucodhbzsglçmpqéákvyfäóxè
|
|
58
|
+
fingerprint: 97-14547+101-13695+110-9531+105-9261+116-7328+114-7126+117-6146+99-5510+111-3435+100-3112+104-2606+98-2375+122-2283+115-2253+103-2237+108-2061+231-1871+109-1010+112-926+113-459+233-424+225-365+107-332+118-319+121-310+102-290+228-66+243-63+120-17+232-17
|
|
59
|
+
|
|
60
|
+
bre:
|
|
61
|
+
name: Breton
|
|
62
|
+
iso1: br
|
|
63
|
+
source: bible/breton/breton_utf8.txt
|
|
64
|
+
size: 330669
|
|
65
|
+
utf8: eanorthduzlisvgmkbcpñwjfùyê
|
|
66
|
+
fingerprint: 101-15077+97-14962+110-8028+111-6448+114-6201+116-5021+104-4400+100-4203+117-3992+122-3937+108-3719+105-3679+115-3144+118-2904+103-2422+109-2406+107-2054+98-1431+99-1287+112-1263+241-1262+119-613+106-604+102-343+249-333+121-153+234-88
|
|
67
|
+
|
|
68
|
+
mya:
|
|
69
|
+
name: Burmese
|
|
70
|
+
iso1: my
|
|
71
|
+
source: bible/myanmar_judson_1835/myanmar_judson_1835_utf8.txt
|
|
72
|
+
size: 3822858
|
|
73
|
+
utf8: ်ာုိေးသကငမတ့ရြအှညနလပစ၊ျူခ။ါွ၏ထယဘံoီဖ၍ဆဝဟဲ၌္ဒnဇဗ၎ဂဉဣဤဏဧဥဦဃဌဩဓဋ
|
|
74
|
+
fingerprint: 4154-10974+4140-6153+4143-5815+4141-5348+4145-4864+4152-4767+4126-4634+4096-4496+4100-3964+4121-3680+4112-3479+4151-3092+4123-2771+4156-2713+4129-2235+4158-2221+4106-2142+4116-2105+4124-2051+4117-2041+4101-1533+4170-1387+4155-1387+4144-1323+4097-1286+4171-1156+4139-1092+4157-1070+4175-890+4113-887+4122-881+4120-776+4150-646+111-602+4142-561+4118-540+4173-520+4102-501+4125-441+4127-423+4146-406+4172-347+4153-244+4114-228+110-207+4103-195+4119-160+4174-117+4098-102+4105-102+4131-81+4132-57+4111-51+4135-49+4133-46+4134-31+4099-21+4108-18+4137-15+4115-14+4107-12
|
|
75
|
+
|
|
76
|
+
cat:
|
|
77
|
+
name: Catalan - Valencian
|
|
78
|
+
iso1: ca
|
|
79
|
+
source: corpora/cat.catalan-valencian.utf-8.txt
|
|
80
|
+
size: 29853676
|
|
81
|
+
utf8: easirtlnocdumpbvgqfóhxéàjèíyòzçúkïwüáñ
|
|
82
|
+
fingerprint: 101-12764+97-11745+115-8078+105-7578+114-6928+116-6488+108-6450+110-6369+111-5009+99-4517+100-4211+117-3806+109-3024+112-2879+98-1316+118-1239+103-1236+113-1025+102-990+243-673+104-506+120-482+233-432+224-408+106-330+232-270+237-260+121-226+242-202+122-170+231-97+250-93+107-38+239-36+119-35+252-31+225-23+241-11
|
|
83
|
+
|
|
84
|
+
zho:
|
|
85
|
+
name: Chinese
|
|
86
|
+
iso1: zh
|
|
87
|
+
source: bible/chinese_union/chinese_union_utf8.txt
|
|
88
|
+
size: 1085910
|
|
89
|
+
utf8: ,的。o他你我們人在和是耶說;不要以就有:n為了子一華這所、上來那必亞大神裡到因中地從去利列都也王使?見得兒行將色給又與事作可著出 約前日之對西個撒樣時!看如麼祭心自並下面用拉話能知城主把拿眾同道叫基沒當起向雅十聽安手被羅巴法比門穌求路呢各天住己民若罪百切候家生多無甚女死長此名像頭回過於摩好猶─聖惡發瑪)衛(物後但告伯國二打受哈離些卻年父
|
|
90
|
+
fingerprint: 65292-5348+30340-4498+12290-2763+111-2132+20182-2080+20320-1837+25105-1804+20497-1761+20154-1585+22312-1207+21644-1148+26159-1123+32822-967+35498-877+65307-874+19981-868+35201-768+20197-760+23601-759+26377-758+65306-755+110-734+28858-732+20102-728+23376-697+19968-663+33775-649+36889-637+25152-618+12289-584+19978-537+20358-533+37027-515+24517-488+20126-457+22823-451+31070-438+35041-437+21040-434+22240-415+20013-404+22320-401+24478-372+21435-360+21033-350+21015-349+37117-346+20063-346+29579-321+20351-309+65311-302+35211-299+24471-295+20818-294+34892-282+23559-279+33394-275+32102-266+21448-262+33287-253+20107-249+20316-248+21487-247+33879-245+20986-241+12288-240+32004-235+21069-233+26085-229+20043-229+23565-227+35199-225+20491-225+25746-224+27171-221+26178-220+65281-217+30475-213+22914-208+40636-203+31085-202+24515-198+33258-196+20006-196+19979-194+38754-190+29992-185+25289-182+35441-181+33021-181+30693-180+22478-178+20027-176+25226-175+25343-175+30526-174+21516-173+36947-172+21483-170+22522-170+27794-169+30070-169+36215-168+21521-168+38597-164+21313-164+32893-163+23433-161+25163-157+34987-154+32645-154+24052-153+27861-152+27604-152+38272-151+31308-151+27714-150+36335-149+21602-148+21508-147+22825-147+20303-146+24049-146+27665-146+33509-144+32618-144+30334-144+20999-143+20505-143+23478-142+29983-142+22810-139+28961-139+29978-135+22899-135+27515-135+38263-135+27492-134+21517-133+20687-132+38957-132+22238-130+36942-129+26044-127+25705-121+22909-120+29494-120+9472-120+32854-119+24801-118+30332-118+29802-118+65289-116+34907-116+65288-115+29289-113+24460-112+20294-111+21578-110+20271-109+22283-108+20108-108+25171-108+21463-105+21704-105+38626-104+20123-102+21371-102+24180-102+29238-101
|
|
91
|
+
|
|
92
|
+
cop:
|
|
93
|
+
name: Coptic
|
|
94
|
+
source: bible/coptic_nt/coptic_nt_utf8.txt
|
|
95
|
+
size: 664953
|
|
96
|
+
utf8: ⲉⲛⲟⲁⲓⲩⲧⲙⲥⲡⲣϩⲱⲏϥϫϣⲃⲗϯⲕnϧⲫⲑⲇⲭⲅϭⲍⲝⲯ
|
|
97
|
+
fingerprint: 11401-12762+11419-10528+11423-9195+11393-7820+11411-6787+11433-5630+11431-4919+11417-4897+11429-3451+11425-3360+11427-3216+1001-3098+11441-2910+11407-2817+997-2590+1003-2199+995-1585+11395-1523+11415-1498+1007-1380+11413-1257+110-1198+999-1164+11435-1131+11409-1097+11399-700+11437-457+11397-425+1005-245+11405-65+11421-35+11439-22
|
|
98
|
+
|
|
99
|
+
ces:
|
|
100
|
+
name: Czech
|
|
101
|
+
iso1: cs
|
|
102
|
+
source: bible/czech_bkr/czech_bkr_utf8.txt
|
|
103
|
+
size: 2809145
|
|
104
|
+
utf8: oeanitslvdmkuphjríbcyzážěřéšýčůťfgúňďó
|
|
105
|
+
fingerprint: 111-8776+101-7924+97-6779+110-5508+105-5311+116-5023+115-4821+108-4572+118-4374+100-3864+109-3463+107-3334+117-3032+112-2867+104-2820+106-2801+114-2637+237-2475+98-2316+99-2020+121-1956+122-1923+225-1873+382-1823+283-1532+345-1287+233-1263+353-1161+253-752+269-602+367-528+357-200+102-108+103-87+250-50+328-49+271-47+243-21
|
|
106
|
+
|
|
107
|
+
dan:
|
|
108
|
+
name: Danish
|
|
109
|
+
iso1: da
|
|
110
|
+
source: corpora/dan.danish.utf-8.txt
|
|
111
|
+
size: 7401657
|
|
112
|
+
utf8: erntisdalogmkfvubphåæøjycwxzé
|
|
113
|
+
fingerprint: 101-15862+114-8668+110-7473+116-6906+105-6145+115-6078+100-6022+97-5793+108-5316+111-4664+103-4365+109-3256+107-3241+102-2533+118-2467+117-1842+98-1617+112-1577+104-1515+229-1055+230-900+248-845+106-663+121-627+99-400+119-63+120-33+122-24+233-17
|
|
114
|
+
|
|
115
|
+
nld:
|
|
116
|
+
name: Dutch-Flemish
|
|
117
|
+
iso1: nl
|
|
118
|
+
bibl: dut
|
|
119
|
+
source: corpora/dut.dutch-flemish.utf-8.txt
|
|
120
|
+
size: 8406113
|
|
121
|
+
utf8: enaitrodslgvmhkubpcjwzfyxéëqï
|
|
122
|
+
fingerprint: 101-18458+110-9769+97-7554+105-6763+116-6684+114-6457+111-5899+100-5385+115-4258+108-3927+103-2953+118-2517+109-2434+104-2325+107-2315+117-2020+98-1620+112-1618+99-1597+106-1457+119-1447+122-1185+102-914+121-139+120-74+233-73+235-67+113-24+239-12
|
|
123
|
+
|
|
124
|
+
eng:
|
|
125
|
+
name: English
|
|
126
|
+
iso1: en
|
|
127
|
+
source: bible/asv/asv_utf8.txt
|
|
128
|
+
size: 3266058
|
|
129
|
+
utf8: ethaonisrdlfumwygcbpvkjzxq
|
|
130
|
+
fingerprint: 101-12583+116-9593+104-8828+97-8648+111-8203+110-7144+105-5921+115-5798+114-5003+100-4626+108-3743+102-2565+117-2534+109-2438+119-1979+121-1810+103-1683+99-1608+98-1491+112-1310+118-1138+107-695+106-479+122-93+120-45+113-29
|
|
131
|
+
|
|
132
|
+
epo:
|
|
133
|
+
name: Esperanto
|
|
134
|
+
iso1: eo
|
|
135
|
+
source: bible/esperanto/esperanto_utf8.txt
|
|
136
|
+
size: 3071751
|
|
137
|
+
utf8: aioenlsrtjkudmxpvgcfbhz
|
|
138
|
+
fingerprint: 97-11947+105-10309+111-9333+101-7953+110-7419+108-6675+115-5944+114-5536+116-4501+106-4394+107-4239+117-3721+100-3246+109-2658+120-2595+112-2102+118-1935+103-1694+99-1122+102-1038+98-831+104-459+122-337
|
|
139
|
+
|
|
140
|
+
est:
|
|
141
|
+
name: Estonian
|
|
142
|
+
iso1: et
|
|
143
|
+
bibl: est
|
|
144
|
+
source: corpora/est.estonian.utf-8.txt
|
|
145
|
+
size: 27559980
|
|
146
|
+
utf8: aeistlunkodmrvgpjhäõbüöfczyw
|
|
147
|
+
fingerprint: 97-11797+101-11304+105-9947+115-8971+116-7745+108-6161+117-5919+110-4908+107-4772+111-4035+100-3822+109-3687+114-3214+118-2375+103-1843+112-1748+106-1614+104-1547+228-1298+245-1176+98-835+252-711+246-281+102-156+99-52+122-21+121-19+119-16
|
|
148
|
+
|
|
149
|
+
fin:
|
|
150
|
+
name: Finnish
|
|
151
|
+
iso1: fi
|
|
152
|
+
bibl: fin
|
|
153
|
+
source: corpora/fin.finnish.utf-8.txt
|
|
154
|
+
size: 9661950
|
|
155
|
+
utf8: aitenslokuämrvjyhpdögfbcwx
|
|
156
|
+
fingerprint: 97-11837+105-10520+116-10114+101-8269+110-8145+115-7750+108-5634+111-5566+107-5304+117-5173+228-3914+109-3198+114-2463+118-2447+106-2053+121-1888+104-1818+112-1807+100-949+246-627+103-178+102-94+98-90+99-76+119-35+120-15
|
|
157
|
+
|
|
158
|
+
fra:
|
|
159
|
+
name: French
|
|
160
|
+
iso1: fr
|
|
161
|
+
bibl: fre
|
|
162
|
+
source: corpora/fre.french.utf-8.txt
|
|
163
|
+
size: 10752039
|
|
164
|
+
utf8: esanirtulodcpmévfgqbhàxjèyêzkôçwîâùû
|
|
165
|
+
fingerprint: 101-14208+115-8057+97-7848+110-7520+105-7294+114-7002+116-6876+117-5681+108-5559+111-5385+100-4181+99-3406+112-2988+109-2694+233-2545+118-1354+102-1120+103-1041+113-921+98-897+104-822+224-472+120-421+106-420+232-351+121-285+234-138+122-102+107-102+244-62+231-59+119-36+238-34+226-32+249-25+251-21
|
|
166
|
+
|
|
167
|
+
gla:
|
|
168
|
+
name: Gaelic
|
|
169
|
+
iso1: gd
|
|
170
|
+
source: bible/manx_gaelic/manx_gaelic_utf8.txt
|
|
171
|
+
size: 388891
|
|
172
|
+
utf8: eayhnsroidgltcmuvjbfkpwqz
|
|
173
|
+
fingerprint: 101-13343+97-10207+121-8944+104-8379+110-8263+115-7064+114-7023+111-6238+105-5135+100-3790+103-3702+108-3610+116-3227+99-2262+109-2065+117-1628+118-1438+106-925+98-843+102-512+107-490+112-452+119-345+113-70+122-29
|
|
174
|
+
|
|
175
|
+
deu:
|
|
176
|
+
name: German
|
|
177
|
+
iso1: de
|
|
178
|
+
bibl: ger
|
|
179
|
+
source: corpora/ger.german.utf-8.txt
|
|
180
|
+
size: 92273185
|
|
181
|
+
utf8: enirtsadhlugcmobfkwzpvüäjöyxq
|
|
182
|
+
fingerprint: 101-16251+110-9918+105-7865+114-7637+116-6348+115-6297+97-5912+100-4806+104-4285+108-3760+117-3715+103-3023+99-2757+109-2724+111-2721+98-2076+102-1755+107-1501+119-1432+122-1238+112-1031+118-936+252-675+228-586+106-276+246-264+121-109+120-55+113-26
|
|
183
|
+
|
|
184
|
+
ell:
|
|
185
|
+
name: Greek - modern (1453-)
|
|
186
|
+
iso1: el
|
|
187
|
+
bibl: gre
|
|
188
|
+
source: bible/greek_modern/greek_modern_utf8.txt
|
|
189
|
+
size: 3400574
|
|
190
|
+
utf8: αοεισντυηκρπλωμδθγβχoφξnζψ
|
|
191
|
+
fingerprint: 945-11466+959-9820+949-9687+953-9421+963-7862+957-7585+964-7395+965-5894+951-3929+954-3673+961-3439+960-2932+955-2881+969-2826+956-2444+948-1953+952-1845+947-1183+946-739+967-690+111-681+966-653+958-376+110-234+950-215+968-163
|
|
192
|
+
|
|
193
|
+
heb:
|
|
194
|
+
name: Hebrew
|
|
195
|
+
iso1: he
|
|
196
|
+
source: bible/hebrew_modern/hebrew_modern_utf8.txt
|
|
197
|
+
size: 1688847
|
|
198
|
+
utf8: יוהאלרבתשמםענכדח׃oקןפךצגזסטnץף
|
|
199
|
+
fingerprint: 1497-11007+1493-10527+1492-8213+1488-8045+1500-7084+1512-5330+1489-5099+1514-5066+1513-4607+1502-4605+1501-3594+1506-3513+1504-3203+1499-2811+1491-2600+1495-2209+1475-1841+111-1371+1511-1286+1503-1254+1508-1237+1498-981+1510-876+1490-826+1494-705+1505-662+1496-529+110-472+1509-231+1507-191
|
|
200
|
+
|
|
201
|
+
hun:
|
|
202
|
+
name: Hungarian
|
|
203
|
+
iso1: hu
|
|
204
|
+
source: bible/hungarian_karoli/hungarian_karoli_utf8.txt
|
|
205
|
+
size: 3096513
|
|
206
|
+
utf8: eatnklsézoimárgdyvhbjõföpuóíúücû
|
|
207
|
+
fingerprint: 101-10749+97-9182+116-7349+110-6455+107-5688+108-5655+115-5611+233-4416+122-4350+111-4278+105-4173+109-4031+225-3472+114-3330+103-3232+100-2684+121-1874+118-1852+104-1528+98-1516+106-1367+245-1049+102-1038+246-1010+112-832+117-737+243-694+237-462+250-427+252-411+99-411+251-120
|
|
208
|
+
|
|
209
|
+
ita:
|
|
210
|
+
name: Italian
|
|
211
|
+
iso1: it
|
|
212
|
+
source: corpora/ita.italian.utf-8.txt
|
|
213
|
+
size: 29036444
|
|
214
|
+
utf8: aieontrlscdpumgvfhzbqèàùìkyéxjw
|
|
215
|
+
fingerprint: 97-11456+105-11392+101-10791+111-9152+110-7348+116-6805+114-6609+108-6405+115-5037+99-4399+100-3728+112-2900+117-2848+109-2558+103-1723+118-1548+102-1053+104-1031+122-990+98-952+113-417+232-293+224-273+249-81+236-45+107-37+121-29+233-27+120-22+106-18+119-18
|
|
216
|
+
|
|
217
|
+
jpn:
|
|
218
|
+
name: Japanese
|
|
219
|
+
iso1: ja
|
|
220
|
+
source: corpora/jp.japan.utf-8.txt
|
|
221
|
+
size: 3767346
|
|
222
|
+
utf8: の、。たにはをいしとがるでてなーかっれらもンり」「1日すうさ2まこ0スだト年きルめ)(け人どく大3会んイや一あえッ中同ラ5本よ出市リつ月ア4ク・国者上合ち回わ後ド前タ場行長6事ロ十時手地そみ発内7二プ分シ見打今8県レせ開フカ9高生業ム目勝間戦メ定ジ約入部チ自テ三子対員方選バ点学マグ社コ田民連お決度全ば用サ表議最体通新立明げ調町的実
|
|
223
|
+
fingerprint: 12398-3249+12289-2721+12290-2649+12383-2239+12395-2097+12399-1965+12434-1961+12356-1779+12375-1687+12392-1634+12364-1632+12427-1613+12391-1589+12390-1321+12394-1218+12540-1153+12363-889+12387-882+12428-803+12425-770+12418-692+12531-684+12426-639+12301-635+12300-629+65297-599+26085-595+12377-586+12358-503+12373-484+65298-478+12414-458+12371-455+65296-453+12473-448+12384-436+12488-425+24180-402+12365-398+12523-398+12417-382+65289-381+65288-381+12369-379+20154-379+12393-375+12367-375+22823-331+65299-329+20250-326+12435-321+12452-318+12420-304+19968-288+12354-278+12360-277+12483-273+20013-272+21516-270+12521-257+65301-253+26412-249+12424-248+20986-245+24066-244+12522-242+12388-242+26376-242+12450-239+65300-238+12463-229+12539-224+22269-224+32773-220+19978-218+21512-206+12385-206+22238-204+12431-200+24460-196+12489-194+21069-194+12479-193+22580-192+34892-189+38263-188+65302-187+20107-187+12525-185+21313-185+26178-185+25163-182+22320-180+12381-177+12415-176+30330-174+20869-172+65303-171+20108-170+12503-169+20998-168+12471-163+35211-162+25171-161+20170-160+65304-160+30476-159+12524-159+12379-158+38283-157+12501-157+12459-156+65305-156+39640-155+29983-155+26989-154+12512-152+30446-151+21213-150+38291-149+25126-148+12513-146+23450-144+12472-144+32004-143+20837-142+37096-141+12481-141+33258-138+12486-137+19977-136+23376-135+23550-133+21729-132+26041-130+36984-129+12496-125+28857-125+23398-123+12510-122+12464-121+31038-121+12467-121+30000-120+27665-120+36899-119+12362-117+27770-115+24230-114+20840-114+12400-113+29992-113+12469-112+34920-110+35696-110+26368-109+20307-109+36890-108+26032-105+31435-104+26126-103+12370-102+35519-102+30010-101+30340-101+23455-101
|
|
224
|
+
|
|
225
|
+
kab:
|
|
226
|
+
name: Kabyle
|
|
227
|
+
source: bible/kabyle_nt/kabyle_nt_utf8.txt
|
|
228
|
+
size: 735419
|
|
229
|
+
utf8: eanidtrslmuykwbɣgɛhcfqzxj
|
|
230
|
+
fingerprint: 101-12692+97-12480+110-10358+105-9610+100-6033+116-5337+114-5028+115-4991+108-4443+109-4330+117-3265+121-3018+107-2625+119-2600+98-2070+611-1886+103-1804+603-1300+104-1257+99-1131+102-1101+113-1094+122-783+120-604+106-144
|
|
231
|
+
|
|
232
|
+
kor:
|
|
233
|
+
name: Korean
|
|
234
|
+
iso1: ko
|
|
235
|
+
source: corpora/kr.korean.utf-8.txt
|
|
236
|
+
size: 15874880
|
|
237
|
+
utf8: 이다의는에을고지한로가대기하사은서정를도자부해으시인국일있수제장원전들나리과보주것상어라아적했경관회동만그위공구당문등조중계개선성화무면소게비업여신명우통민내안방유와세실미연할재교년치거금산러진용스행발단영체백간모되오분학김려천난현야입차된마표요생결반며역력운터후없니각건말법직설권출강북식본물데않합히임속월때최외남었작까청노씨예종처달불심점두호바군르감양따총령련울던협책받파추평판음태급포근석트특환집날또타드증매찰배
|
|
238
|
+
fingerprint: 51060-3170+45796-2505+51032-2171+45716-2114+50640-1940+51012-1746+44256-1581+51648-1431+54620-1404+47196-1394+44032-1342+45824-1242+44592-1219+54616-1215+49324-1139+51008-1138+49436-1051+51221-975+47484-970+46020-945+51088-920+48512-858+54644-847+51004-813+49884-804+51064-793+44397-754+51068-749+51080-747+49688-734+51228-698+51109-693+50896-666+51204-653+46308-653+45208-621+47532-612+44284-602+48372-590+51452-578+44163-561+49345-561+50612-557+46972-528+50500-520+51201-494+54664-493+44221-477+44288-472+54924-466+46041-459+47564-448+44536-446+50948-431+44277-419+44396-419+45817-418+47928-412+46321-412+51312-410+51473-396+44228-388+44060-384+49440-381+49457-368+54868-365+47924-364+47732-356+49548-350+44172-343+48708-342+50629-336+50668-332+49888-330+47749-330+50864-329+53685-329+48124-327+45236-316+50504-306+48169-305+50976-294+50752-293+49464-292+49892-291+48120-290+50672-288+54624-280+51116-280+44368-279+45380-277+52824-277+44144-273+44552-270+49328-267+47084-265+51652-265+50857-261+49828-258+54665-251+48156-248+45800-248+50689-236+52404-234+48177-232+44036-230+47784-230+46104-228+50724-223+48516-223+54617-216+44608-210+47140-207+52380-207+45212-206+54788-206+50556-204+51077-203+52264-202+46108-201+47560-200+54364-199+50836-198+49373-198+44208-196+48152-194+47728-194+50669-193+47141-192+50868-188+53552-188+54980-187+50630-187+45768-185+44033-182+44148-181+47568-181+48277-181+51649-181+49444-179+44428-177+52636-175+44053-172+48513-172+49885-172+48376-169+47932-169+45936-169+50506-169+54633-169+55176-164+51076-162+49549-162+50900-160+46412-157+52572-154+50808-153+45224-152+50632-152+51089-150+44620-150+52397-146+45432-145+50472-145+50696-145+51333-144+52376-144+45804-143+48520-142+49900-142+51216-141+46160-141+54840-138+48148-134+44400-133+47476-132+44048-132+50577-132+46384-131+52509-130+47161-130+47144-130+50872-123+45912-122+54801-122+52293-117+48155-117+54028-116+52628-115+54217-114+54032-114+51020-114+53468-113+44553-111+54252-109+44540-109+49437-108+
53944-107+53945-104+54872-104+51665-103+45216-103+46608-103+53440-103+46300-102+51613-102+47588-102+52272-102+48176-102
|
|
239
|
+
|
|
240
|
+
lat:
|
|
241
|
+
name: Latin
|
|
242
|
+
iso1: la
|
|
243
|
+
source: bible/latin_vulgata_clementina/latin_vulgata_clementina_utf8.txt
|
|
244
|
+
size: 3326239
|
|
245
|
+
utf8: eitusanormcdlpbvqgfhæjxyëzœ
|
|
246
|
+
fingerprint: 101-11692+105-11144+116-8911+117-8124+115-7571+97-7497+110-6565+111-5996+114-5791+109-5633+99-3509+100-3344+108-2704+112-2312+98-1557+118-1459+113-1377+103-1035+102-938+104-840+230-688+106-563+120-455+121-105+235-89+122-56+339-27
|
|
247
|
+
|
|
248
|
+
lav:
|
|
249
|
+
name: Latvian
|
|
250
|
+
iso1: lv
|
|
251
|
+
source: bible/latvian_nt/latvian_nt_utf8.txt
|
|
252
|
+
size: 652159
|
|
253
|
+
utf8: aisenturmāvkdjlpoīēbzcgņšūļžķģfhč
|
|
254
|
+
fingerprint: 97-10928+105-9618+115-8594+101-6433+110-6162+116-6119+117-5874+114-3788+109-3666+257-3638+118-3598+107-3410+100-3357+106-2891+108-2467+112-2341+111-2333+299-2253+275-2076+98-1903+122-1790+99-1398+103-1315+326-1208+353-1068+363-885+316-443+382-147+311-99+291-66+102-49+104-43+269-24
|
|
255
|
+
|
|
256
|
+
lit:
|
|
257
|
+
name: Lithuanian
|
|
258
|
+
iso1: lt
|
|
259
|
+
source: bible/lithuanian/lithuanian_utf8.txt
|
|
260
|
+
size: 2622771
|
|
261
|
+
utf8: iaseoturnkmvjpdlšėgbyųžąįūęzčhcf
|
|
262
|
+
fingerprint: 105-12588+97-11808+115-8495+101-6142+111-5914+116-5394+117-5240+114-4783+110-4766+107-4222+109-3091+118-2924+106-2768+112-2691+100-2681+108-2648+353-2030+279-2010+103-1574+98-1407+121-1323+371-1076+382-868+261-867+303-693+363-600+281-358+122-350+269-327+104-170+99-102+102-73
|
|
263
|
+
|
|
264
|
+
mri:
|
|
265
|
+
name: Maori
|
|
266
|
+
iso1: mi
|
|
267
|
+
bibl: mao
|
|
268
|
+
source: bible/maori/maori_utf8.txt
|
|
269
|
+
size: 3363174
|
|
270
|
+
utf8: aioetknhurmgwp
|
|
271
|
+
fingerprint: 97-23163+105-11136+111-9086+101-9005+116-8452+107-7771+110-6216+104-5930+117-5343+114-5047+109-2832+103-2369+119-2119+112-1524
|
|
272
|
+
|
|
273
|
+
nor:
|
|
274
|
+
name: Norwegian
|
|
275
|
+
iso1: 'no' # a 'no' without quotes would be interpreted by yaml as false!
|
|
276
|
+
source: bible/norwegian/norwegian_utf8.txt
|
|
277
|
+
size: 2856143
|
|
278
|
+
utf8: erntsodaiglmkhvfuåbjøpyæ
|
|
279
|
+
fingerprint: 101-15294+114-7967+110-7421+116-6589+115-6369+111-6333+100-6281+97-6127+105-5439+103-4869+108-4793+109-3692+107-3266+104-2699+118-2435+102-2114+117-1589+229-1370+98-1275+106-1247+248-1022+112-908+121-600+230-271
|
|
280
|
+
|
|
281
|
+
por:
|
|
282
|
+
name: Portuguese
|
|
283
|
+
iso1: pt
|
|
284
|
+
source: corpora/por.portuguese.utf-8.txt
|
|
285
|
+
size: 3168005
|
|
286
|
+
utf8: eaosridmuntclpvhqfgbãáéjzçêíxóàõúôâ
|
|
287
|
+
fingerprint: 101-13242+97-11555+111-11259+115-9136+114-6758+105-5214+100-5105+109-4739+117-4552+110-4456+116-4133+99-3016+108-2716+112-2364+118-1604+104-1421+113-1133+102-1059+103-980+98-976+227-945+225-557+233-522+106-508+122-431+231-401+234-315+237-277+120-176+243-171+224-87+245-59+250-52+244-37+226-14
|
|
288
|
+
|
|
289
|
+
rom:
|
|
290
|
+
name: Romany
|
|
291
|
+
source: bible/romani/romani_utf8.txt
|
|
292
|
+
size: 799230
|
|
293
|
+
utf8: aeisnolkrdhtmupvgcbzwxjfy
|
|
294
|
+
fingerprint: 97-15760+101-11712+105-8702+115-7047+110-6918+111-6350+108-5689+107-4714+114-4495+100-4351+104-4228+116-3801+109-3363+117-3248+112-2191+118-2094+103-1348+99-1089+98-881+122-800+119-342+120-337+106-298+102-203+121-25
|
|
295
|
+
|
|
296
|
+
ron:
|
|
297
|
+
name: Romanian
|
|
298
|
+
iso1: ro
|
|
299
|
+
bibl: rum
|
|
300
|
+
source: bible/romanian_cornilescu/romanian_cornilescu_utf8.txt
|
|
301
|
+
size: 3094740
|
|
302
|
+
utf8: ieaunrtlcosămdpîşvţfzbghj
|
|
303
|
+
fingerprint: 105-11276+101-10539+97-9609+117-7142+110-6125+114-5922+116-5788+108-5203+99-4968+111-4701+115-4105+259-4091+109-3241+100-3140+112-3032+238-2352+351-1935+118-1626+355-1120+102-1118+122-826+98-809+103-650+104-412+106-256
|
|
304
|
+
|
|
305
|
+
rus:
|
|
306
|
+
name: Russian
|
|
307
|
+
iso1: ru
|
|
308
|
+
source: bible/russian_makarij/russian_makarij_utf8.txt
|
|
309
|
+
size: 566856
|
|
310
|
+
utf8: оиаеътснвлрдмкпугяыјбзьoйхжічшюцщфэneurbmsѕ
|
|
311
|
+
fingerprint: 1086-10340+1080-7173+1072-7021+1077-6793+1098-6003+1090-5408+1089-5200+1085-5145+1074-4808+1083-4060+1088-3637+1076-3224+1084-2822+1082-2588+1087-2517+1091-2306+1075-2133+1103-1933+1099-1777+1112-1775+1073-1708+1079-1584+1100-1447+111-1034+1081-1015+1093-982+1078-968+1110-926+1095-866+1096-743+1102-683+1094-407+1097-299+1092-131+1101-76+110-55+101-55+117-54+114-54+98-54+109-53+115-53+1109-50
|
|
312
|
+
|
|
313
|
+
srp:
|
|
314
|
+
name: Serbian (Croatian)
|
|
315
|
+
iso1: sr
|
|
316
|
+
source: bible/croatian/croatian_utf8.txt
|
|
317
|
+
size: 3019099
|
|
318
|
+
utf8: aoienjsutvrdmklpgzbšhčćcžđf
|
|
319
|
+
fingerprint: 97-11403+111-10019+105-9908+101-9465+110-5617+106-5456+115-4874+117-4284+116-4267+118-4171+114-4155+100-3750+109-3520+107-2984+108-2949+112-2448+103-1827+122-1773+98-1667+353-1306+104-1063+269-810+263-772+99-603+382-573+273-234+102-74
|
|
320
|
+
|
|
321
|
+
wen:
|
|
322
|
+
name: Sorbian languages
|
|
323
|
+
source: corpora/wen.sorbish.utf-8.txt
|
|
324
|
+
size: 6871623
|
|
325
|
+
utf8: aeojnwsitrkduhmpyclzběšćžłóźčřgfńövx
|
|
326
|
+
fingerprint: 97-9290+101-8335+111-8218+106-5718+110-5480+119-4786+115-4658+105-4410+116-3764+114-3688+107-3520+100-3407+117-3398+104-3206+109-3135+112-2454+121-2432+99-2274+108-2254+122-2246+98-2092+283-1717+353-1446+263-1436+382-1409+322-1277+243-937+378-920+269-758+345-589+103-190+102-170+324-157+246-135+118-35+120-11
|
|
327
|
+
|
|
328
|
+
spa:
|
|
329
|
+
name: Spanish - Castilian
|
|
330
|
+
iso1: es
|
|
331
|
+
source: corpora/spa.spanish-castilian.utf-8.txt
|
|
332
|
+
size: 2942875
|
|
333
|
+
utf8: eaosrnlditucmphbyvqjgíáóéfzñúx
|
|
334
|
+
fingerprint: 101-12863+97-11473+111-9525+115-8330+114-6464+110-5964+108-5434+100-5344+105-5127+116-4008+117-3998+99-3387+109-2532+112-2204+104-1907+98-1602+121-1420+118-1226+113-1126+106-1101+103-908+237-725+225-691+243-656+233-654+102-553+122-371+241-173+250-154+120-54
|
|
335
|
+
|
|
336
|
+
swe:
|
|
337
|
+
name: Swedish
|
|
338
|
+
iso1: sv
|
|
339
|
+
source: corpora/swe.swedish.utf-8.txt
|
|
340
|
+
size: 8249056
|
|
341
|
+
utf8: eartnsildomkgvfäuhpöåbcjyxwzé
|
|
342
|
+
fingerprint: 101-9738+97-9129+114-8656+116-8411+110-8358+115-6352+105-5781+108-5342+100-4280+111-4239+109-3526+107-3375+103-3194+118-2437+102-2090+228-1969+117-1947+104-1893+112-1873+246-1505+229-1502+98-1490+99-1342+106-664+121-616+120-148+119-70+122-22+233-21
|
|
343
|
+
|
|
344
|
+
tgl:
|
|
345
|
+
name: Tagalog
|
|
346
|
+
iso1: tl
|
|
347
|
+
source: bible/tagalog_1905/tagalog_1905_utf8.txt
|
|
348
|
+
size: 3919850
|
|
349
|
+
utf8: angisotkylmupbhdrwejcvzf
|
|
350
|
+
fingerprint: 97-25597+110-14169+103-9447+105-8807+115-5049+111-4691+116-4342+107-3870+121-3621+108-3431+109-3414+117-2605+112-2588+98-2038+104-1688+100-1389+114-1187+119-856+101-784+106-166+99-129+118-46+122-45+102-24
|
|
351
|
+
|
|
352
|
+
tha:
|
|
353
|
+
name: Thai
|
|
354
|
+
iso1: th
|
|
355
|
+
source: bible/thai_kjv/thai_kjv_utf8.txt
|
|
356
|
+
size: 3649596
|
|
357
|
+
utf8: านรอเ้ง่ะักยลมวทหขดพจีิคแตบสป์ไูใืช็โุoำึผถซญฮธnศษฟณภฉฝๆฏฐฤฆ๋
|
|
358
|
+
fingerprint: 3634-7583+3609-5349+3619-4959+3629-4737+3648-4704+3657-4604+3591-4553+3656-4263+3632-3122+3633-2912+3585-2895+3618-2716+3621-2557+3617-2522+3623-2515+3607-2387+3627-2189+3586-2141+3604-2069+3614-2053+3592-1864+3637-1801+3636-1792+3588-1752+3649-1742+3605-1715+3610-1574+3626-1532+3611-1329+3660-1277+3652-1212+3641-1183+3651-1024+3639-985+3594-926+3655-867+3650-717+3640-670+111-634+3635-592+3638-568+3612-481+3606-455+3595-316+3597-277+3630-253+3608-226+110-218+3624-213+3625-203+3615-146+3603-134+3616-92+3593-84+3613-78+3654-44+3599-31+3600-28+3620-27+3590-25+3659-23
|
|
359
|
+
|
|
360
|
+
tur:
|
|
361
|
+
name: Turkish
|
|
362
|
+
iso1: tr
|
|
363
|
+
source: corpora/tur.turkish.utf-8.txt
|
|
364
|
+
size: 8641602
|
|
365
|
+
utf8: aeirlnkıdtmsuyobüzgşvhcçpğöfİjäwâãxîå
|
|
366
|
+
fingerprint: 97-11237+101-9918+105-9269+114-7424+108-7280+110-7102+107-4467+305-3973+100-3944+116-3939+109-3746+115-3435+117-3064+121-2791+111-2486+98-2218+252-1781+122-1395+103-1363+351-1360+118-1209+104-981+99-971+231-935+112-923+287-904+246-800+102-555+304-162+106-110+228-42+119-37+226-31+227-30+120-28+238-26+229-13
|
|
367
|
+
|
|
368
|
+
ukr:
|
|
369
|
+
name: Ukrainian
|
|
370
|
+
iso1: uk
|
|
371
|
+
source: bible/ukrainian_1871/ukrainian_1871_utf8.txt
|
|
372
|
+
size: 590457
|
|
373
|
+
utf8: оаивнестірлдумпкбгяйьзїжхчnшющєцф
|
|
374
|
+
fingerprint: 1086-10730+1072-7105+1080-7022+1074-6059+1085-5214+1077-5160+1089-4861+1090-4777+1110-4394+1088-3756+1083-3408+1076-3343+1091-3200+1084-3064+1087-2590+1082-2519+1073-2342+1075-2219+1103-1908+1081-1860+1100-1814+1079-1769+1111-1416+1078-1401+1093-1381+1095-1361+110-1354+1096-1006+1102-892+1097-871+1108-675+1094-426+1092-57
|
|
375
|
+
|
|
376
|
+
ppk:
|
|
377
|
+
name: Uma
|
|
378
|
+
source: bible/uma/uma_utf8.txt
|
|
379
|
+
size: 969131
|
|
380
|
+
utf8: aoinutempklhrgswbdyjcfz
|
|
381
|
+
fingerprint: 97-20282+111-12108+105-9323+110-7106+117-6498+116-6239+101-5501+109-4900+112-4773+107-4011+108-3551+104-3397+114-3308+103-1694+115-1593+119-1532+98-1523+100-1362+121-522+106-410+99-309+102-31+122-16
|
|
382
|
+
|
|
383
|
+
vie:
|
|
384
|
+
name: Vietnamese
|
|
385
|
+
iso1: vi
|
|
386
|
+
source: bible/vietnamese_1934/vietnamese_1934_utf8.txt
|
|
387
|
+
size: 2877551
|
|
388
|
+
utf8: nhicgtaomđàrvưluysáôêbờkpóơếìứúạdấớðãủầềâeảộậxốẽựữởằùọợòịặíệồẻăqắừểỏẳéổũễụèửỡỗỉjẹẫýõĩẩ
|
|
389
|
+
fingerprint: 110-11578+104-7754+105-6648+99-6449+103-6109+116-5507+97-4372+111-2610+109-2518+273-2422+224-2385+114-2134+118-2083+432-2059+108-1976+117-1835+121-1744+115-1624+225-1455+244-1380+234-1340+98-1218+7901-1144+107-988+112-977+243-951+417-891+7871-802+236-738+7913-714+250-712+7841-666+100-657+7845-592+7899-563+240-542+227-535+7911-501+7847-498+7873-492+226-478+101-472+7843-429+7897-426+7853-408+120-404+7889-391+7869-349+7921-347+7919-340+7903-336+7857-318+249-312+7885-296+7907-291+242-260+7883-256+7863-241+237-236+7879-230+7891-228+7867-226+259-208+113-202+7855-198+7915-178+7875-177+7887-162+7859-147+233-140+7893-135+361-119+7877-103+7909-98+232-95+7917-93+7905-86+7895-86+7881-63+106-40+7865-40+7851-37+253-24+245-21+297-19+7849-19
|
|
390
|
+
|
|
391
|
+
wol:
|
|
392
|
+
name: Wolof
|
|
393
|
+
iso1: wo
|
|
394
|
+
source: bible/wolof/wolof_utf8.txt
|
|
395
|
+
size: 649171
|
|
396
|
+
utf8: aneioulmkydgbtsrwxcñjàéëpfóqŋ
|
|
397
|
+
fingerprint: 97-13971+110-9286+101-8574+105-6580+111-6182+117-5720+108-5145+109-4547+107-3911+121-3776+100-3602+103-3309+98-3259+116-3143+115-2484+114-2428+119-1966+120-1687+99-1652+241-1604+106-1345+224-1251+233-1080+235-1078+112-1017+102-994+243-265+113-88+331-41
|
|
398
|
+
|
|
399
|
+
xho:
|
|
400
|
+
name: Xhosa
|
|
401
|
+
iso1: xh
|
|
402
|
+
source: bible/xhosa/xhosa_utf8.txt
|
|
403
|
+
size: 3255471
|
|
404
|
+
utf8: aeniouklhbmywstzgdprvxfqcj
|
|
405
|
+
fingerprint: 97-14460+101-9449+110-9134+105-8849+111-6723+117-6621+107-5942+108-5088+104-4592+98-4123+109-3418+121-3272+119-3193+115-2913+116-2520+122-2335+103-2223+100-1717+112-737+114-510+118-493+120-440+102-390+113-365+99-273+106-207
|
|
406
|
+
|