langa 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. data/COPYING +674 -0
  2. data/README +69 -0
  3. data/bin/langa +169 -0
  4. data/examples/afrikaans_1953_utf8.txt +1000 -0
  5. data/examples/albanian_utf8.txt +1000 -0
  6. data/examples/amharic_utf8.txt +1000 -0
  7. data/examples/arabic_svd_utf8.txt +1000 -0
  8. data/examples/armenian_western_1853_utf8.txt +1000 -0
  9. data/examples/asv_utf8.txt +1000 -0
  10. data/examples/basque_1571_utf8.txt +1000 -0
  11. data/examples/breton_utf8.txt +1000 -0
  12. data/examples/chinese_ncv_s_utf8.txt +1000 -0
  13. data/examples/chinese_ncv_utf8.txt +1000 -0
  14. data/examples/chinese_union_s_utf8.txt +1000 -0
  15. data/examples/chinese_union_utf8.txt +1000 -0
  16. data/examples/coptic_nt_utf8.txt +1000 -0
  17. data/examples/croatian_utf8.txt +1000 -0
  18. data/examples/czech_bkr_utf8.txt +1000 -0
  19. data/examples/danish_utf8.txt +1000 -0
  20. data/examples/dutch_svv_utf8.txt +1000 -0
  21. data/examples/esperanto_utf8.txt +1000 -0
  22. data/examples/estonian_utf8.txt +1000 -0
  23. data/examples/finnish_pr_1992_utf8.txt +1000 -0
  24. data/examples/french_ostervald_1996_utf8.txt +1000 -0
  25. data/examples/german_schlachter_1951_utf8.txt +1000 -0
  26. data/examples/greek_byzantine_2000_utf8.txt +1000 -0
  27. data/examples/greek_modern_utf8.txt +1000 -0
  28. data/examples/hebrew_modern_utf8.txt +1000 -0
  29. data/examples/hungarian_karoli_utf8.txt +1000 -0
  30. data/examples/italian_riveduta_1927_utf8.txt +1000 -0
  31. data/examples/kabyle_nt_utf8.txt +1000 -0
  32. data/examples/kjv_apocrypha_utf8.txt +1000 -0
  33. data/examples/korean_utf8.txt +1000 -0
  34. data/examples/latin_vulgata_clementina_utf8.txt +1000 -0
  35. data/examples/latvian_nt_utf8.txt +1000 -0
  36. data/examples/lithuanian_utf8.txt +1000 -0
  37. data/examples/manx_gaelic_utf8.txt +1000 -0
  38. data/examples/maori_utf8.txt +1000 -0
  39. data/examples/myanmar_judson_1835_utf8.txt +1000 -0
  40. data/examples/norwegian_utf8.txt +1000 -0
  41. data/examples/peshitta_utf8.txt +1000 -0
  42. data/examples/portuguese_utf8.txt +1000 -0
  43. data/examples/romani_utf8.txt +1000 -0
  44. data/examples/romanian_cornilescu_utf8.txt +1000 -0
  45. data/examples/russian_makarij_utf8.txt +1000 -0
  46. data/examples/spanish_reina_valera_1909_utf8.txt +1000 -0
  47. data/examples/swedish_1917_utf8.txt +1000 -0
  48. data/examples/tagalog_1905_utf8.txt +1000 -0
  49. data/examples/thai_kjv_utf8.txt +1000 -0
  50. data/examples/turkish_nt_utf8.txt +1000 -0
  51. data/examples/turkish_utf8.txt +1000 -0
  52. data/examples/ukrainian_1871_utf8.txt +1000 -0
  53. data/examples/vietnamese_1934_utf8.txt +1000 -0
  54. data/examples/wolof_utf8.txt +1000 -0
  55. data/examples/xhosa_utf8.txt +1000 -0
  56. data/lib/langa.rb +35 -0
  57. data/lib/langa/dna.rb +209 -0
  58. data/lib/langa/file.rb +97 -0
  59. data/lib/langa/langa.dna +406 -0
  60. data/lib/langa/languageanalyzer.rb +134 -0
  61. data/lib/langa/languages.rb +147 -0
  62. data/lib/langa/randomtestfiles.rb +140 -0
  63. data/lib/langa/utilities.rb +53 -0
  64. data/test/tc_file.rb +47 -0
  65. data/test/tc_languages.rb +69 -0
  66. data/test/tc_utilities.rb +42 -0
  67. data/unicode/CaseFolding.txt +1065 -0
  68. data/unicode/CaseFolding.txt.webloc +8 -0
  69. data/unicode/Index of -Public-MAPPINGS.webloc b/data/unicode/Index of → -Public-MAPPINGS.webloc +0 -0
  70. data/unicode/mappings/8859-1.TXT +303 -0
  71. data/unicode/mappings/8859-10.TXT +303 -0
  72. data/unicode/mappings/8859-11.TXT +297 -0
  73. data/unicode/mappings/8859-13.TXT +299 -0
  74. data/unicode/mappings/8859-14.TXT +301 -0
  75. data/unicode/mappings/8859-15.TXT +303 -0
  76. data/unicode/mappings/8859-16.TXT +299 -0
  77. data/unicode/mappings/8859-2.TXT +303 -0
  78. data/unicode/mappings/8859-3.TXT +296 -0
  79. data/unicode/mappings/8859-4.TXT +303 -0
  80. data/unicode/mappings/8859-5.TXT +303 -0
  81. data/unicode/mappings/8859-6.TXT +260 -0
  82. data/unicode/mappings/8859-7.TXT +308 -0
  83. data/unicode/mappings/8859-8.TXT +270 -0
  84. data/unicode/mappings/8859-9.TXT +307 -0
  85. data/unicode/mappings/ATARIST.TXT +313 -0
  86. data/unicode/mappings/CP037.TXT +275 -0
  87. data/unicode/mappings/CP1006.TXT +302 -0
  88. data/unicode/mappings/CP1026.TXT +275 -0
  89. data/unicode/mappings/CP1250.TXT +274 -0
  90. data/unicode/mappings/CP1251.TXT +274 -0
  91. data/unicode/mappings/CP1252.TXT +274 -0
  92. data/unicode/mappings/CP1253.TXT +274 -0
  93. data/unicode/mappings/CP1254.TXT +274 -0
  94. data/unicode/mappings/CP1255.TXT +274 -0
  95. data/unicode/mappings/CP1256.TXT +274 -0
  96. data/unicode/mappings/CP1257.TXT +274 -0
  97. data/unicode/mappings/CP1258.TXT +274 -0
  98. data/unicode/mappings/CP424.TXT +304 -0
  99. data/unicode/mappings/CP437.TXT +274 -0
  100. data/unicode/mappings/CP500.TXT +275 -0
  101. data/unicode/mappings/CP737.TXT +274 -0
  102. data/unicode/mappings/CP775.TXT +275 -0
  103. data/unicode/mappings/CP850.TXT +274 -0
  104. data/unicode/mappings/CP852.TXT +274 -0
  105. data/unicode/mappings/CP855.TXT +275 -0
  106. data/unicode/mappings/CP856.TXT +303 -0
  107. data/unicode/mappings/CP857.TXT +275 -0
  108. data/unicode/mappings/CP860.TXT +275 -0
  109. data/unicode/mappings/CP861.TXT +275 -0
  110. data/unicode/mappings/CP862.TXT +275 -0
  111. data/unicode/mappings/CP863.TXT +275 -0
  112. data/unicode/mappings/CP864.TXT +275 -0
  113. data/unicode/mappings/CP865.TXT +275 -0
  114. data/unicode/mappings/CP866.TXT +275 -0
  115. data/unicode/mappings/CP869.TXT +275 -0
  116. data/unicode/mappings/CP874.TXT +274 -0
  117. data/unicode/mappings/CP875.TXT +275 -0
  118. data/unicode/mappings/KOI8-R.TXT +302 -0
  119. data/unicode/mappings/NEXTSTEP.TXT +173 -0
  120. data/unicode/mappings/ROMAN.TXT +275 -0
  121. data/unicode/mappings/US-ASCII-QUOTES.TXT +198 -0
  122. metadata +180 -0
@@ -0,0 +1,35 @@
1
+ =begin
2
+
3
+ Copyright © 2007 John Vorhauer
4
+ Contact me at langa@vorhauer.de near 50°55'N+6°55'E.
5
+
6
+ This file is part of Langa.
7
+
8
+ Langa is free software: you can redistribute it and/or modify
9
+ it under the terms of the GNU General Public License as published by
10
+ the Free Software Foundation, either version 3 of the License, or
11
+ (at your option) any later version.
12
+
13
+ Langa is distributed in the hope that it will be useful,
14
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ GNU General Public License for more details.
17
+
18
+ You should have received a copy of the GNU General Public License
19
+ along with Langa. If not, see <http://www.gnu.org/licenses/>.
20
+
21
+ For a detailed functional description of Langa see README file
22
+
23
+ =end
24
+
25
+ $:.unshift File.join(File.dirname(__FILE__), 'langa')
26
+
27
+ require 'rubygems'
28
+ require 'utilities'
29
+ require 'file'
30
+ require 'dna'
31
+ require 'languages'
32
+ require 'languageanalyzer'
33
+ require 'randomtestfiles'
34
+ require 'getoptlong'
35
+
@@ -0,0 +1,209 @@
1
+ =begin
2
+
3
+ Copyright © 2007 John Vorhauer
4
+ Contact me at langa@vorhauer.de near 50°55'N+6°55'E.
5
+
6
+ This file is part of Langa.
7
+
8
+ Langa is free software: you can redistribute it and/or modify
9
+ it under the terms of the GNU General Public License as published by
10
+ the Free Software Foundation, either version 3 of the License, or
11
+ (at your option) any later version.
12
+
13
+ Langa is distributed in the hope that it will be useful,
14
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ GNU General Public License for more details.
17
+
18
+ You should have received a copy of the GNU General Public License
19
+ along with Langa. If not, see <http://www.gnu.org/licenses/>.
20
+
21
+ For a detailed functional description of Langa see README file
22
+
23
+ =end
24
+
25
+ =begin
26
+
27
+ The class DNA creates a typical fingerprint from a unicode character stream.
28
+ This fingerprint can be compared with fingerprints of other test streams to
29
+ support an automatic language recognition.
30
+
31
+ The fingerprint is a statistical analysis of the frequency of occurrence of
32
+ single characters. With the analysis non letter characters are filtered and
33
+ upper case letters are mapped to lowercase.
34
+
35
+ The distance between two fingerprints is measured in the sum of distances
36
+ between each single letter.
37
+
38
+ =end
39
+
40
class DNA

  # Lookup table shared by all DNA objects: code point -> code point that is
  # counted in its place. A code point that is missing from the table is not
  # counted by add_gene (for code points up to U+0250).
  @@gene_map = Hash.new

  # The gene map has two main purposes for all characters in the range
  # of U+0000 to U+024F:
  #
  # 1. Map a unicode character from uppercase to lowercase
  # 2. Filter relevant characters from punctuation and spacing characters
  #
  # For the first purpose the gene map relies on the CaseFolding.txt file
  # from unicode.org. For the second purpose some relevant characters are
  # mapped by hand, because they are not differentiated between
  # upper/lowercase. Grab latest version from
  # http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
  def DNA.fill_gene_map
    # => locate the local copy of CaseFolding.txt
    case_fold = File.join(File.dirname(__FILE__), '..', '..', 'unicode', 'CaseFolding.txt')

    # => load upper-/lowercase mappings; the block form closes the file
    File.open(case_fold) do |file|
      file.each_line do |line|
        # Line format looks like
        #   0041; C; 0061; # LATIN CAPITAL LETTER A
        # Comment and blank lines yield no status field and are skipped.
        code, stat, mapp = line.gsub(/ /, '').split(';')
        if stat == 'C' || stat == 'S'
          code, mapp = code.hex, mapp.hex
          # both the source and the folded character count as the folded one
          @@gene_map[code] = @@gene_map[mapp] = mapp
        end
      end
    end

    # complete mapping for use as legal character identification
    [0x130, 0x131, 0x138, 0x149, 0x180, 0x18d, 0x19b, 0x1aa, 0x1ab, 0x1ba,
     0x1bb, 0x1be, 0x1f0, 0x221, 0x234, 0x235, 0x236, 0x237, 0x238, 0x239,
     0x23a, 0x23e, 0x23f, 0x240].each { |code| @@gene_map[code] = code }
  end

  # Create a new DNA object. You can create a DNA object from an existing
  # fingerprint string (the format produced by to_s).
  #   d = DNA.new
  #   d = DNA.new(fingerprint)
  #
  # Raises ArgumentError if more than one argument is given or the single
  # argument is not a String.
  def initialize(*parm)
    # => initialize class variable on first use
    @@gene_map.empty? && DNA.fill_gene_map

    # Per-instance state. These must be set here: assignments in the class
    # body would create variables on the class object, not on instances.
    @dna_chain   = Hash.new(0)   # code point => number of occurrences
    @dna_size    = 0             # number of add_gene calls since last reset
    @fingerprint = nil           # calculated lazily by #fingerprint

    # => check parameters
    if parm.size > 1
      raise ArgumentError, "wrong number of argument (#{parm.size} for 0/1)"
    elsif parm.size == 1
      unless parm[0].is_a?(String)
        raise ArgumentError, "wrong type of argument (String expected)"
      end
      # => create dna object from fingerprint, e.g. '101-16251+110-9918'
      @fingerprint = Hash.new
      parm[0].scan(/([^+-]+)-([^+-]+)/).each do |gene|
        # Assign in two steps: "idx, @fingerprint[idx] = ..." evaluates the
        # index before idx is assigned on Ruby >= 3.1.
        idx, weight = gene.collect { |var| var.to_i }
        @fingerprint[idx] = weight
      end
    end
  end

  # Add an unicode character to the dna chain. This has to be done before
  # the dna fingerprint is calculated. If the fingerprint was already
  # calculated, you have to reset the dna object, before you can add another
  # character.
  #   add_gene(0x123)
  #
  # Raises RuntimeError if a fingerprint already exists.
  def add_gene(unicode)
    raise "fingerprint already calculated, try reset first" unless @fingerprint.nil?
    if unicode > 0x0250
      # NOTE(review): the original test was "unicode === (0x2b0..0x2af)",
      # which can never be true (Integer#=== against an empty range). It is
      # assumed the Spacing Modifier Letters block U+02B0..U+02FF was meant
      # to be skipped - confirm.
      @dna_chain[unicode] += 1 unless (0x2b0..0x2ff).include?(unicode)
    else
      # low code points are case folded and filtered through the gene map
      @dna_chain[@@gene_map[unicode]] += 1 if @@gene_map.has_key?(unicode)
    end
    # every call is counted, whether or not the character was kept
    @dna_size += 1
  end

  # With feed you can give complete files as an input to the dna. You
  # must specify a codepage for input character conversion (preferably
  # UTF-8). For codepage namings see class File. Any previously collected
  # state is reset first.
  #   dna = DNA.new
  #   dna.feed('input-text', '8859-1')
  def feed(filename, codepage)
    self.reset
    # block form so the file is closed again after reading
    File.open([filename, codepage]) do |file|
      file.each_unicode { |uc| add_gene(uc) }
    end
  end

  # The fingerprint is the significant extract of a file, which is essential
  # for the language recognition process.
  #
  # Returns an Array of [code point, weight] pairs sorted by descending
  # weight when calculated from added genes, or the Hash
  # { code point => weight } when the object was created from a fingerprint
  # string. Weights are occurrences per 100000 counted characters.
  def fingerprint
    if @fingerprint.nil?
      # => threshold below which a gene counts as insignificant
      filter = (@dna_chain.size > 1000) ? 100 : 10

      # => check the length of the chain, i.e. number of counted characters
      length = 0
      @dna_chain.each_value { |count| length += count }
      @size = length   # stored as in the original; not read within this class

      # => normalize the frequency of characters and drop rare ones
      weighted = @dna_chain.collect do |char, freq|
        weight = (freq * 100000.0 / length).to_i
        (weight > filter) ? [char, weight] : nil
      end
      @fingerprint = weighted.compact.sort { |a, b| b[1] <=> a[1] }
    end
    @fingerprint
  end

  # Calculate the distance between two fingerprints to measure the equality.
  # For each gene of this fingerprint the absolute weight difference to the
  # other fingerprint is summed up; a gene missing from the other
  # fingerprint contributes its full weight.
  #   dna.distance(other_dna) -> distance
  def distance(dna)
    fp = dna.fingerprint
    # a calculated fingerprint is an Array of pairs; look-ups need a Hash
    fp = Hash[fp] unless fp.respond_to?(:has_key?)
    dst = 0
    # use the accessor so the fingerprint is calculated if still missing
    fingerprint.each do |char, freq|
      dst += (fp.has_key?(char) ? (fp[char] - freq).abs : freq)
    end
    dst / 1000.0
  end

  # Reset the DNA object: forget all added genes and the fingerprint.
  def reset
    (@dna_chain ||= Hash.new(0)).clear
    @dna_size = 0
    @fingerprint = nil
  end

  # Number of characters passed to add_gene since creation or the last reset.
  def size
    @dna_size
  end

  # Convert the fingerprint to an UTF-8 string.
  #   dna.to_utf8 -> 'enirtsadhlugcmobfkwzpvüäjöyxq'
  # Relies on an Array#to_utf8 extension that is not defined in this class.
  def to_utf8
    fingerprint.collect { |pair| pair[0] }.to_utf8
  end

  # Convert the fingerprint to a string.
  #   dna.to_s -> '101-16251+110-9918+105-7865+...'
  def to_s
    fingerprint.collect { |gene| gene.join('-') }.join('+')
  end

  # def distance_orig(reference)
  #   ref = reference.dup
  #   @fingerprint.each do |gene| char, count = gene
  #     ref.has_key?(char) && ref[char]=(ref[char]>count)?(ref[char]-count):0
  #   end
  #   dst = 0
  #   ref.each {|k,v| dst += v if k.is_a?(Numeric)}
  #   dst / 1000.0
  # end

end
@@ -0,0 +1,97 @@
1
+ =begin
2
+
3
+ Copyright © 2007 John Vorhauer
4
+ Contact me at langa@vorhauer.de near 50°55'N+6°55'E.
5
+
6
+ This file is part of Langa.
7
+
8
+ Langa is free software: you can redistribute it and/or modify
9
+ it under the terms of the GNU General Public License as published by
10
+ the Free Software Foundation, either version 3 of the License, or
11
+ (at your option) any later version.
12
+
13
+ Langa is distributed in the hope that it will be useful,
14
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ GNU General Public License for more details.
17
+
18
+ You should have received a copy of the GNU General Public License
19
+ along with Langa. If not, see <http://www.gnu.org/licenses/>.
20
+
21
+ For a detailed functional description of Langa see README file
22
+
23
+ =end
24
+
25
+ =begin rdoc
26
+
27
+ Extend class File to convert characters from different codepages into an
28
+ unicode stream.
29
+
30
+ You can specify any codepage, that is listed in the directory
31
+ #{LANGA}/unicode/mappings (omit the '.txt' extension).
32
+ Grab the latest versions from http://www.unicode.org/Public/MAPPINGS
33
+
34
+ =end
35
+
36
class File
  # Keep the built-in constructor reachable. The guard prevents a second load
  # of this file from aliasing the patched constructor onto itself, which
  # would recurse endlessly.
  unless private_method_defined?(:old_initialize) || method_defined?(:old_initialize)
    alias_method :old_initialize, :initialize
  end

  # Specify a codepage with the new or open method. If no codepage is
  # specified, UTF-8 will be assumed.
  #   file = File.open([filename, codepage], ...)
  #
  # Raises RuntimeError if a codepage is given together with a write or
  # append mode string. Ordinary File.open(path, 'w') calls without a
  # codepage array are passed through unchanged.
  def initialize(*par)
    self.codepage = 'utf-8'
    if par[0].is_a?(Array)
      filename, cp = par[0]
      self.codepage = cp unless cp.nil?
      par[0] = filename
      # only string modes can be inspected; integer flags and option hashes
      # are handed to the built-in constructor untouched
      if par[1].is_a?(String) && par[1] =~ /[wa]/
        raise "mode not supported with codepage conversion"
      end
    end
    old_initialize(*par)
  end
  # Forward keyword options (mode:, encoding:, ...) through the splat on
  # Rubies that separate keywords from positional arguments.
  ruby2_keywords(:initialize) if respond_to?(:ruby2_keywords, true)

  # Specify a codepage after open and before processing the file.
  #   file.codepage = '8859-4'
  #
  # For every codepage other than UTF-8 the mapping table
  # unicode/mappings/<codepage>.txt is loaded into a byte -> code point
  # array; bytes without a mapping stay 0. The file name is matched case
  # insensitively because the shipped tables are named *.TXT. Raises a
  # SystemCallError (e.g. Errno::ENOENT) if no table can be opened.
  def codepage=(cp)
    @codepage = cp
    unless utf8?
      @code_map = Array.new(256, 0)
      maps = File.join(File.dirname(__FILE__), '..', '..', 'unicode', 'mappings')

      wanted  = "#{cp}.txt"
      entries = File.directory?(maps) ? Dir.entries(maps) : []
      # prefer the exact name, then any spelling that differs only in case;
      # fall back to the exact name so a missing table reports that path
      actual = if entries.include?(wanted)
                 wanted
               else
                 entries.find { |entry| entry.downcase == wanted.downcase } || wanted
               end

      # block form closes the mapping table after reading
      File.open(File.join(maps, actual)) do |table|
        table.each_line do |line|
          # data lines start with two hex numbers: codepage byte, code point
          line.downcase =~ %r|^([0-9a-fx]+)\s([0-9a-fx]+)| &&
            @code_map[$1.hex] = $2.hex
        end
      end
    end
  end

  # Walk through a file step by step for each unicode character.
  #   file.each_unicode { |unicode| ... }
  # All lines are read first, then each line is transcoded and its code
  # points are yielded one by one.
  def each_unicode
    readlines.each do |line|
      transcode(line).each do |char|
        yield char
      end
    end
  end

  # True, if codepage of file is UTF-8 ('utf-8' or 'utf8', in any case).
  #   file.utf8? -> true
  def utf8?
    %w[utf-8 utf8].include?(@codepage.to_s.downcase)
  end


  private

  # Convert a string from codepage coding to an array of unicode code points.
  # UTF-8 input relies on a String#to_unicode extension that is not defined
  # in this class; any other codepage is mapped byte by byte via @code_map.
  def transcode(str)
    return str.to_unicode if utf8?

    unicode = []
    str.each_byte { |byte| unicode << @code_map[byte] }
    unicode
  end

end
+ end
97
+
@@ -0,0 +1,406 @@
1
+ ---
2
+ afk:
3
+ name: Afrikaans
4
+ iso1: af
5
+ source: bible/afrikaans_1953/afrikaans_1953_utf8.txt
6
+ size: 3232617
7
+ utf8: eaniodrstlghmuvkwybpjfêëéíáócäú
8
+ fingerprint: 101-17648+97-8725+110-8531+105-7167+111-6711+100-6026+114-5526+115-5517+116-5095+108-4392+103-3582+104-3129+109-2643+117-2532+118-2440+107-2391+119-2011+121-1553+98-1161+112-971+106-875+102-608+234-249+235-149+233-91+237-70+225-70+243-37+99-27+228-27+250-12
9
+
10
+ sqi:
11
+ name: Albanian
12
+ iso1: sq
13
+ bibl: alb
14
+ source: bible/albanian/albanian_utf8.txt
15
+ size: 3160948
16
+ utf8: ëteiarnhsodjumkplbgvqzfyçcx
17
+ fingerprint: 235-10285+116-9353+101-8926+105-7899+97-6944+114-6647+110-6168+104-4860+115-4544+111-4290+100-4128+106-3965+117-3677+109-3102+107-2718+112-2629+108-2115+98-1576+103-1378+118-1102+113-984+122-903+102-734+121-662+231-198+99-165+120-30
18
+
19
+ amh:
20
+ name: Amharic
21
+ iso1: am
22
+ source: bible/amharic/amharic_utf8.txt
23
+ size: 421502
24
+ utf8: ንምእአበርትውስለልየ።ይነnመደናተችሁያከግድወብላ፥ሰገህታሉሱራማረ፤ቀባሆዚቸክሚጠዘቅፈዳሳጋዲጥፍሞቶጣሩቱቃዋሕሥሔኢካሙኛዱሎኝዛዎጌኑዝሌሊኔጊኋዜሮኖዓቢፋዙጅቡቆቁሲፊጡጽጸኃሪጉሐቤቻዕሣሃሄጀዩሴዶጢዮጻሬቹጂፉሶጆሽጎዴጦጭኳሠጨኞሸፎቴሻኤዊኩሜኪጳቂዬቲጴኵኅቼፅጁዞጮኘጫቦሦዖኙኸቈቍጐቄኮጃጹሾኒፃሡፀሂሹጲጄቋፁፌሺጓጕጶጤኬ
25
+ fingerprint: 4757-6355+4637-4061+4773-3399+4768-3041+4704-2965+4653-2895+4725-2761+4813-2712+4661-2659+4616-2598+4621-2255+4840-2158+4962-2007+4845-1999+4752-1919+110-1819+4632-1798+4848-1770+4755-1716+4720-1658+4733-1614+4609-1577+4843-1421+4776-1417+4877-1391+4853-1359+4808-1355+4709-1352+4619-1309+4965-1242+4656-1154+4872-1143+4613-1020+4723-908+4617-804+4657-803+4651-787+4635-761+4648-732+4964-731+4672-729+4707-712+4614-692+4826-663+4728-615+4781-594+4634-555+4896-548+4824-520+4677-519+4936-518+4851-505+4659-483+4875-467+4850-443+4901-442+4941-432+4638-417+4726-414+4899-398+4649-350+4721-350+4675-323+4811-317+4629-315+4645-307+4628-293+4770-285+4779-278+4633-261+4763-245+4849-241+4622-241+4765-237+4827-226+4814-226+4876-220+4753-219+4829-217+4620-217+4618-216+4756-216+4874-214+4747-212+4828-210+4654-208+4758-204+4819-190+4706-187+4939-180+4825-173+4869-167+4705-166+4678-156+4673-153+4658-148+4938-147+4897-141+4925-139+4920-137+4739-130+4650-125+4873-124+4624-121+4708-118+4731-116+4821-116+4643-108+4611-106+4612-106+4864-105+4841-98+4660-98+4854-97+4898-95+4846-94+4923-93+4652-93+4729-87+4866-86+4937-84+4662-83+4870-83+4669-82+4878-77+4852-75+4902-74+4909-71+4787-71+4640-71+4904-69+4766-63+4664-62+4942-61+4724-60+4667-60+4772-60+4810-58+4777-53+4636-53+4778-47+4915-47+4674-44+4844-42+4722-42+4916-42+4789-42+4741-41+4732-39+4933-39+4865-39+4830-35+4910-35+4760-33+4907-33+4710-32+4646-31+4822-29+4761-29+4792-28+4680-27+4685-27+4880-27+4676-27+4782-26+4867-25+4921-25+4670-25+4754-24+4931-22+4641-21+4928-20+4610-19+4665-15+4914-15+4868-15+4683-14+4929-14+4940-13+4666-13+4883-12+4885-12+4918-11+4900-11+4780-11
26
+
27
+ ara:
28
+ name: Arabic
29
+ iso1: ar
30
+ source: bible/arabic_svd/arabic_svd_utf8.txt
31
+ size: 1913503
32
+ utf8: اليومنربهعتكفسدقحجةoذشخىصطضئثnّزأء‎غآظؤ
33
+ fingerprint: 1575-16041+1604-10781+1610-7450+1608-6617+1605-6376+1606-5991+1585-4336+1576-4169+1607-3668+1593-3248+1578-3233+1603-3135+1601-2570+1587-2422+1583-2315+1602-1971+1581-1551+1580-1375+1577-1346+111-1210+1584-1180+1588-947+1582-918+1609-914+1589-730+1591-686+1590-639+1574-561+1579-532+110-416+1617-397+1586-390+1571-388+1569-355+8206-284+1594-282+1570-232+1592-199+1572-102
34
+
35
+ arc:
36
+ name: Aramaic
37
+ source: bible/peshitta/peshitta_utf8.txt
38
+ size: 477091
39
+ utf8: ܐܘܢܝܠܕܡܗܬܪܒܟܫܥܚ܀nܩܦܤܓܛܙܨ
40
+ fingerprint: 1808-13524+1816-9865+1826-9307+1821-8743+1824-7178+1813-6238+1825-6166+1815-5318+1836-5069+1834-4171+1810-4138+1823-2870+1835-2697+1829-2512+1818-2194+1792-1670+110-1670+1833-1408+1830-1395+1828-1242+1811-864+1819-792+1817-605+1832-315
41
+
42
+ hye:
43
+ name: Armenian
44
+ iso1: hy
45
+ bibl: arm
46
+ source: bible/armenian_western_1853/armenian_western_1853_utf8.txt
47
+ size: 692147
48
+ utf8: անորեիւսմտէկըnյքցլհպբթվդզղչծգխջռձշօփճժ
49
+ fingerprint: 1377-12429+1398-8854+1400-8031+1408-7801+1381-7001+1387-5885+1410-5729+1405-3496+1396-3272+1407-3135+1383-3093+1391-2705+1384-2505+110-2303+1397-2089+1412-2078+1409-1706+1388-1624+1392-1595+1402-1336+1378-1210+1385-1193+1406-1192+1380-1170+1382-979+1394-950+1401-949+1390-948+1379-870+1389-563+1403-558+1404-552+1393-538+1399-538+1413-458+1411-264+1395-198+1386-155
50
+
51
+ eus:
52
+ name: Basque
53
+ iso1: eu
54
+ bibl: baq
55
+ source: bible/basque_1571/basque_1571_utf8.txt
56
+ size: 844751
57
+ utf8: aenitrucodhbzsglçmpqéákvyfäóxè
58
+ fingerprint: 97-14547+101-13695+110-9531+105-9261+116-7328+114-7126+117-6146+99-5510+111-3435+100-3112+104-2606+98-2375+122-2283+115-2253+103-2237+108-2061+231-1871+109-1010+112-926+113-459+233-424+225-365+107-332+118-319+121-310+102-290+228-66+243-63+120-17+232-17
59
+
60
+ bre:
61
+ name: Breton
62
+ iso1: br
63
+ source: bible/breton/breton_utf8.txt
64
+ size: 330669
65
+ utf8: eanorthduzlisvgmkbcpñwjfùyê
66
+ fingerprint: 101-15077+97-14962+110-8028+111-6448+114-6201+116-5021+104-4400+100-4203+117-3992+122-3937+108-3719+105-3679+115-3144+118-2904+103-2422+109-2406+107-2054+98-1431+99-1287+112-1263+241-1262+119-613+106-604+102-343+249-333+121-153+234-88
67
+
68
+ mya:
69
+ name: Burmese
70
+ iso1: my
71
+ source: bible/myanmar_judson_1835/myanmar_judson_1835_utf8.txt
72
+ size: 3822858
73
+ utf8: ်ာုိေးသကငမတ့ရြအှညနလပစ၊ျူခ။ါွ၏ထယဘံoီဖ၍ဆဝဟဲ၌္ဒnဇဗ၎ဂဉဣဤဏဧဥဦဃဌဩဓဋ
74
+ fingerprint: 4154-10974+4140-6153+4143-5815+4141-5348+4145-4864+4152-4767+4126-4634+4096-4496+4100-3964+4121-3680+4112-3479+4151-3092+4123-2771+4156-2713+4129-2235+4158-2221+4106-2142+4116-2105+4124-2051+4117-2041+4101-1533+4170-1387+4155-1387+4144-1323+4097-1286+4171-1156+4139-1092+4157-1070+4175-890+4113-887+4122-881+4120-776+4150-646+111-602+4142-561+4118-540+4173-520+4102-501+4125-441+4127-423+4146-406+4172-347+4153-244+4114-228+110-207+4103-195+4119-160+4174-117+4098-102+4105-102+4131-81+4132-57+4111-51+4135-49+4133-46+4134-31+4099-21+4108-18+4137-15+4115-14+4107-12
75
+
76
+ cat:
77
+ name: Catalan - Valencian
78
+ iso1: ca
79
+ source: corpora/cat.catalan-valencian.utf-8.txt
80
+ size: 29853676
81
+ utf8: easirtlnocdumpbvgqfóhxéàjèíyòzçúkïwüáñ
82
+ fingerprint: 101-12764+97-11745+115-8078+105-7578+114-6928+116-6488+108-6450+110-6369+111-5009+99-4517+100-4211+117-3806+109-3024+112-2879+98-1316+118-1239+103-1236+113-1025+102-990+243-673+104-506+120-482+233-432+224-408+106-330+232-270+237-260+121-226+242-202+122-170+231-97+250-93+107-38+239-36+119-35+252-31+225-23+241-11
83
+
84
+ zho:
85
+ name: Chinese
86
+ iso1: zh
87
+ source: bible/chinese_union/chinese_union_utf8.txt
88
+ size: 1085910
89
+ utf8: ,的。o他你我們人在和是耶說;不要以就有:n為了子一華這所、上來那必亞大神裡到因中地從去利列都也王使?見得兒行將色給又與事作可著出 約前日之對西個撒樣時!看如麼祭心自並下面用拉話能知城主把拿眾同道叫基沒當起向雅十聽安手被羅巴法比門穌求路呢各天住己民若罪百切候家生多無甚女死長此名像頭回過於摩好猶─聖惡發瑪)衛(物後但告伯國二打受哈離些卻年父
90
+ fingerprint: 65292-5348+30340-4498+12290-2763+111-2132+20182-2080+20320-1837+25105-1804+20497-1761+20154-1585+22312-1207+21644-1148+26159-1123+32822-967+35498-877+65307-874+19981-868+35201-768+20197-760+23601-759+26377-758+65306-755+110-734+28858-732+20102-728+23376-697+19968-663+33775-649+36889-637+25152-618+12289-584+19978-537+20358-533+37027-515+24517-488+20126-457+22823-451+31070-438+35041-437+21040-434+22240-415+20013-404+22320-401+24478-372+21435-360+21033-350+21015-349+37117-346+20063-346+29579-321+20351-309+65311-302+35211-299+24471-295+20818-294+34892-282+23559-279+33394-275+32102-266+21448-262+33287-253+20107-249+20316-248+21487-247+33879-245+20986-241+12288-240+32004-235+21069-233+26085-229+20043-229+23565-227+35199-225+20491-225+25746-224+27171-221+26178-220+65281-217+30475-213+22914-208+40636-203+31085-202+24515-198+33258-196+20006-196+19979-194+38754-190+29992-185+25289-182+35441-181+33021-181+30693-180+22478-178+20027-176+25226-175+25343-175+30526-174+21516-173+36947-172+21483-170+22522-170+27794-169+30070-169+36215-168+21521-168+38597-164+21313-164+32893-163+23433-161+25163-157+34987-154+32645-154+24052-153+27861-152+27604-152+38272-151+31308-151+27714-150+36335-149+21602-148+21508-147+22825-147+20303-146+24049-146+27665-146+33509-144+32618-144+30334-144+20999-143+20505-143+23478-142+29983-142+22810-139+28961-139+29978-135+22899-135+27515-135+38263-135+27492-134+21517-133+20687-132+38957-132+22238-130+36942-129+26044-127+25705-121+22909-120+29494-120+9472-120+32854-119+24801-118+30332-118+29802-118+65289-116+34907-116+65288-115+29289-113+24460-112+20294-111+21578-110+20271-109+22283-108+20108-108+25171-108+21463-105+21704-105+38626-104+20123-102+21371-102+24180-102+29238-101
91
+
92
+ cop:
93
+ name: Coptic
94
+ source: bible/coptic_nt/coptic_nt_utf8.txt
95
+ size: 664953
96
+ utf8: ⲉⲛⲟⲁⲓⲩⲧⲙⲥⲡⲣϩⲱⲏϥϫϣⲃⲗϯⲕnϧⲫⲑⲇⲭⲅϭⲍⲝⲯ
97
+ fingerprint: 11401-12762+11419-10528+11423-9195+11393-7820+11411-6787+11433-5630+11431-4919+11417-4897+11429-3451+11425-3360+11427-3216+1001-3098+11441-2910+11407-2817+997-2590+1003-2199+995-1585+11395-1523+11415-1498+1007-1380+11413-1257+110-1198+999-1164+11435-1131+11409-1097+11399-700+11437-457+11397-425+1005-245+11405-65+11421-35+11439-22
98
+
99
+ ces:
100
+ name: Czech
101
+ iso1: cs
102
+ source: bible/czech_bkr/czech_bkr_utf8.txt
103
+ size: 2809145
104
+ utf8: oeanitslvdmkuphjríbcyzážěřéšýčůťfgúňďó
105
+ fingerprint: 111-8776+101-7924+97-6779+110-5508+105-5311+116-5023+115-4821+108-4572+118-4374+100-3864+109-3463+107-3334+117-3032+112-2867+104-2820+106-2801+114-2637+237-2475+98-2316+99-2020+121-1956+122-1923+225-1873+382-1823+283-1532+345-1287+233-1263+353-1161+253-752+269-602+367-528+357-200+102-108+103-87+250-50+328-49+271-47+243-21
106
+
107
+ dan:
108
+ name: Danish
109
+ iso1: da
110
+ source: corpora/dan.danish.utf-8.txt
111
+ size: 7401657
112
+ utf8: erntisdalogmkfvubphåæøjycwxzé
113
+ fingerprint: 101-15862+114-8668+110-7473+116-6906+105-6145+115-6078+100-6022+97-5793+108-5316+111-4664+103-4365+109-3256+107-3241+102-2533+118-2467+117-1842+98-1617+112-1577+104-1515+229-1055+230-900+248-845+106-663+121-627+99-400+119-63+120-33+122-24+233-17
114
+
115
+ nld:
116
+ name: Dutch-Flemish
117
+ iso1: nl
118
+ bibl: dut
119
+ source: corpora/dut.dutch-flemish.utf-8.txt
120
+ size: 8406113
121
+ utf8: enaitrodslgvmhkubpcjwzfyxéëqï
122
+ fingerprint: 101-18458+110-9769+97-7554+105-6763+116-6684+114-6457+111-5899+100-5385+115-4258+108-3927+103-2953+118-2517+109-2434+104-2325+107-2315+117-2020+98-1620+112-1618+99-1597+106-1457+119-1447+122-1185+102-914+121-139+120-74+233-73+235-67+113-24+239-12
123
+
124
+ eng:
125
+ name: English
126
+ iso1: en
127
+ source: bible/asv/asv_utf8.txt
128
+ size: 3266058
129
+ utf8: ethaonisrdlfumwygcbpvkjzxq
130
+ fingerprint: 101-12583+116-9593+104-8828+97-8648+111-8203+110-7144+105-5921+115-5798+114-5003+100-4626+108-3743+102-2565+117-2534+109-2438+119-1979+121-1810+103-1683+99-1608+98-1491+112-1310+118-1138+107-695+106-479+122-93+120-45+113-29
131
+
132
+ epo:
133
+ name: Esperanto
134
+ iso1: eo
135
+ source: bible/esperanto/esperanto_utf8.txt
136
+ size: 3071751
137
+ utf8: aioenlsrtjkudmxpvgcfbhz
138
+ fingerprint: 97-11947+105-10309+111-9333+101-7953+110-7419+108-6675+115-5944+114-5536+116-4501+106-4394+107-4239+117-3721+100-3246+109-2658+120-2595+112-2102+118-1935+103-1694+99-1122+102-1038+98-831+104-459+122-337
139
+
140
+ est:
141
+ name: Estonian
142
+ iso1: et
143
+ bibl: est
144
+ source: corpora/est.estonian.utf-8.txt
145
+ size: 27559980
146
+ utf8: aeistlunkodmrvgpjhäõbüöfczyw
147
+ fingerprint: 97-11797+101-11304+105-9947+115-8971+116-7745+108-6161+117-5919+110-4908+107-4772+111-4035+100-3822+109-3687+114-3214+118-2375+103-1843+112-1748+106-1614+104-1547+228-1298+245-1176+98-835+252-711+246-281+102-156+99-52+122-21+121-19+119-16
148
+
149
+ fin:
150
+ name: Finnish
151
+ iso1: fi
152
+ bibl: fin
153
+ source: corpora/fin.finnish.utf-8.txt
154
+ size: 9661950
155
+ utf8: aitenslokuämrvjyhpdögfbcwx
156
+ fingerprint: 97-11837+105-10520+116-10114+101-8269+110-8145+115-7750+108-5634+111-5566+107-5304+117-5173+228-3914+109-3198+114-2463+118-2447+106-2053+121-1888+104-1818+112-1807+100-949+246-627+103-178+102-94+98-90+99-76+119-35+120-15
157
+
158
+ fra:
159
+ name: French
160
+ iso1: fr
161
+ bibl: fre
162
+ source: corpora/fre.french.utf-8.txt
163
+ size: 10752039
164
+ utf8: esanirtulodcpmévfgqbhàxjèyêzkôçwîâùû
165
+ fingerprint: 101-14208+115-8057+97-7848+110-7520+105-7294+114-7002+116-6876+117-5681+108-5559+111-5385+100-4181+99-3406+112-2988+109-2694+233-2545+118-1354+102-1120+103-1041+113-921+98-897+104-822+224-472+120-421+106-420+232-351+121-285+234-138+122-102+107-102+244-62+231-59+119-36+238-34+226-32+249-25+251-21
166
+
167
+ gla:
168
+ name: Gaelic
169
+ iso1: gd
170
+ source: bible/manx_gaelic/manx_gaelic_utf8.txt
171
+ size: 388891
172
+ utf8: eayhnsroidgltcmuvjbfkpwqz
173
+ fingerprint: 101-13343+97-10207+121-8944+104-8379+110-8263+115-7064+114-7023+111-6238+105-5135+100-3790+103-3702+108-3610+116-3227+99-2262+109-2065+117-1628+118-1438+106-925+98-843+102-512+107-490+112-452+119-345+113-70+122-29
174
+
175
+ deu:
176
+ name: German
177
+ iso1: de
178
+ bibl: ger
179
+ source: corpora/ger.german.utf-8.txt
180
+ size: 92273185
181
+ utf8: enirtsadhlugcmobfkwzpvüäjöyxq
182
+ fingerprint: 101-16251+110-9918+105-7865+114-7637+116-6348+115-6297+97-5912+100-4806+104-4285+108-3760+117-3715+103-3023+99-2757+109-2724+111-2721+98-2076+102-1755+107-1501+119-1432+122-1238+112-1031+118-936+252-675+228-586+106-276+246-264+121-109+120-55+113-26
183
+
184
+ ell:
185
+ name: Greek - modern (1453-)
186
+ iso1: el
187
+ bibl: gre
188
+ source: bible/greek_modern/greek_modern_utf8.txt
189
+ size: 3400574
190
+ utf8: αοεισντυηκρπλωμδθγβχoφξnζψ
191
+ fingerprint: 945-11466+959-9820+949-9687+953-9421+963-7862+957-7585+964-7395+965-5894+951-3929+954-3673+961-3439+960-2932+955-2881+969-2826+956-2444+948-1953+952-1845+947-1183+946-739+967-690+111-681+966-653+958-376+110-234+950-215+968-163
192
+
193
+ heb:
194
+ name: Hebrew
195
+ iso1: he
196
+ source: bible/hebrew_modern/hebrew_modern_utf8.txt
197
+ size: 1688847
198
+ utf8: יוהאלרבתשמםענכדח׃oקןפךצגזסטnץף
199
+ fingerprint: 1497-11007+1493-10527+1492-8213+1488-8045+1500-7084+1512-5330+1489-5099+1514-5066+1513-4607+1502-4605+1501-3594+1506-3513+1504-3203+1499-2811+1491-2600+1495-2209+1475-1841+111-1371+1511-1286+1503-1254+1508-1237+1498-981+1510-876+1490-826+1494-705+1505-662+1496-529+110-472+1509-231+1507-191
200
+
201
+ hun:
202
+ name: Hungarian
203
+ iso1: hu
204
+ source: bible/hungarian_karoli/hungarian_karoli_utf8.txt
205
+ size: 3096513
206
+ utf8: eatnklsézoimárgdyvhbjõföpuóíúücû
207
+ fingerprint: 101-10749+97-9182+116-7349+110-6455+107-5688+108-5655+115-5611+233-4416+122-4350+111-4278+105-4173+109-4031+225-3472+114-3330+103-3232+100-2684+121-1874+118-1852+104-1528+98-1516+106-1367+245-1049+102-1038+246-1010+112-832+117-737+243-694+237-462+250-427+252-411+99-411+251-120
208
+
209
+ ita:
210
+ name: Italian
211
+ iso1: it
212
+ source: corpora/ita.italian.utf-8.txt
213
+ size: 29036444
214
+ utf8: aieontrlscdpumgvfhzbqèàùìkyéxjw
215
+ fingerprint: 97-11456+105-11392+101-10791+111-9152+110-7348+116-6805+114-6609+108-6405+115-5037+99-4399+100-3728+112-2900+117-2848+109-2558+103-1723+118-1548+102-1053+104-1031+122-990+98-952+113-417+232-293+224-273+249-81+236-45+107-37+121-29+233-27+120-22+106-18+119-18
216
+
217
+ jpn:
218
+ name: Japanese
219
+ iso1: ja
220
+ source: corpora/jp.japan.utf-8.txt
221
+ size: 3767346
222
+ utf8: の、。たにはをいしとがるでてなーかっれらもンり」「1日すうさ2まこ0スだト年きルめ)(け人どく大3会んイや一あえッ中同ラ5本よ出市リつ月ア4ク・国者上合ち回わ後ド前タ場行長6事ロ十時手地そみ発内7二プ分シ見打今8県レせ開フカ9高生業ム目勝間戦メ定ジ約入部チ自テ三子対員方選バ点学マグ社コ田民連お決度全ば用サ表議最体通新立明げ調町的実
223
+ fingerprint: 12398-3249+12289-2721+12290-2649+12383-2239+12395-2097+12399-1965+12434-1961+12356-1779+12375-1687+12392-1634+12364-1632+12427-1613+12391-1589+12390-1321+12394-1218+12540-1153+12363-889+12387-882+12428-803+12425-770+12418-692+12531-684+12426-639+12301-635+12300-629+65297-599+26085-595+12377-586+12358-503+12373-484+65298-478+12414-458+12371-455+65296-453+12473-448+12384-436+12488-425+24180-402+12365-398+12523-398+12417-382+65289-381+65288-381+12369-379+20154-379+12393-375+12367-375+22823-331+65299-329+20250-326+12435-321+12452-318+12420-304+19968-288+12354-278+12360-277+12483-273+20013-272+21516-270+12521-257+65301-253+26412-249+12424-248+20986-245+24066-244+12522-242+12388-242+26376-242+12450-239+65300-238+12463-229+12539-224+22269-224+32773-220+19978-218+21512-206+12385-206+22238-204+12431-200+24460-196+12489-194+21069-194+12479-193+22580-192+34892-189+38263-188+65302-187+20107-187+12525-185+21313-185+26178-185+25163-182+22320-180+12381-177+12415-176+30330-174+20869-172+65303-171+20108-170+12503-169+20998-168+12471-163+35211-162+25171-161+20170-160+65304-160+30476-159+12524-159+12379-158+38283-157+12501-157+12459-156+65305-156+39640-155+29983-155+26989-154+12512-152+30446-151+21213-150+38291-149+25126-148+12513-146+23450-144+12472-144+32004-143+20837-142+37096-141+12481-141+33258-138+12486-137+19977-136+23376-135+23550-133+21729-132+26041-130+36984-129+12496-125+28857-125+23398-123+12510-122+12464-121+31038-121+12467-121+30000-120+27665-120+36899-119+12362-117+27770-115+24230-114+20840-114+12400-113+29992-113+12469-112+34920-110+35696-110+26368-109+20307-109+36890-108+26032-105+31435-104+26126-103+12370-102+35519-102+30010-101+30340-101+23455-101
224
+
225
+ kab:
226
+ name: Kabyle
227
+ source: bible/kabyle_nt/kabyle_nt_utf8.txt
228
+ size: 735419
229
+ utf8: eanidtrslmuykwbɣgɛhcfqzxj
230
+ fingerprint: 101-12692+97-12480+110-10358+105-9610+100-6033+116-5337+114-5028+115-4991+108-4443+109-4330+117-3265+121-3018+107-2625+119-2600+98-2070+611-1886+103-1804+603-1300+104-1257+99-1131+102-1101+113-1094+122-783+120-604+106-144
231
+
232
+ kor:
233
+ name: Korean
234
+ iso1: ko
235
+ source: corpora/kr.korean.utf-8.txt
236
+ size: 15874880
237
+ utf8: 이다의는에을고지한로가대기하사은서정를도자부해으시인국일있수제장원전들나리과보주것상어라아적했경관회동만그위공구당문등조중계개선성화무면소게비업여신명우통민내안방유와세실미연할재교년치거금산러진용스행발단영체백간모되오분학김려천난현야입차된마표요생결반며역력운터후없니각건말법직설권출강북식본물데않합히임속월때최외남었작까청노씨예종처달불심점두호바군르감양따총령련울던협책받파추평판음태급포근석트특환집날또타드증매찰배
238
+ fingerprint: 51060-3170+45796-2505+51032-2171+45716-2114+50640-1940+51012-1746+44256-1581+51648-1431+54620-1404+47196-1394+44032-1342+45824-1242+44592-1219+54616-1215+49324-1139+51008-1138+49436-1051+51221-975+47484-970+46020-945+51088-920+48512-858+54644-847+51004-813+49884-804+51064-793+44397-754+51068-749+51080-747+49688-734+51228-698+51109-693+50896-666+51204-653+46308-653+45208-621+47532-612+44284-602+48372-590+51452-578+44163-561+49345-561+50612-557+46972-528+50500-520+51201-494+54664-493+44221-477+44288-472+54924-466+46041-459+47564-448+44536-446+50948-431+44277-419+44396-419+45817-418+47928-412+46321-412+51312-410+51473-396+44228-388+44060-384+49440-381+49457-368+54868-365+47924-364+47732-356+49548-350+44172-343+48708-342+50629-336+50668-332+49888-330+47749-330+50864-329+53685-329+48124-327+45236-316+50504-306+48169-305+50976-294+50752-293+49464-292+49892-291+48120-290+50672-288+54624-280+51116-280+44368-279+45380-277+52824-277+44144-273+44552-270+49328-267+47084-265+51652-265+50857-261+49828-258+54665-251+48156-248+45800-248+50689-236+52404-234+48177-232+44036-230+47784-230+46104-228+50724-223+48516-223+54617-216+44608-210+47140-207+52380-207+45212-206+54788-206+50556-204+51077-203+52264-202+46108-201+47560-200+54364-199+50836-198+49373-198+44208-196+48152-194+47728-194+50669-193+47141-192+50868-188+53552-188+54980-187+50630-187+45768-185+44033-182+44148-181+47568-181+48277-181+51649-181+49444-179+44428-177+52636-175+44053-172+48513-172+49885-172+48376-169+47932-169+45936-169+50506-169+54633-169+55176-164+51076-162+49549-162+50900-160+46412-157+52572-154+50808-153+45224-152+50632-152+51089-150+44620-150+52397-146+45432-145+50472-145+50696-145+51333-144+52376-144+45804-143+48520-142+49900-142+51216-141+46160-141+54840-138+48148-134+44400-133+47476-132+44048-132+50577-132+46384-131+52509-130+47161-130+47144-130+50872-123+45912-122+54801-122+52293-117+48155-117+54028-116+52628-115+54217-114+54032-114+51020-114+53468-113+44553-111+54252-109+44540-109+49437-10
8+53944-107+53945-104+54872-104+51665-103+45216-103+46608-103+53440-103+46300-102+51613-102+47588-102+52272-102+48176-102
239
+
240
+ lat:
241
+ name: Latin
242
+ iso1: la
243
+ source: bible/latin_vulgata_clementina/latin_vulgata_clementina_utf8.txt
244
+ size: 3326239
245
+ utf8: eitusanormcdlpbvqgfhæjxyëzœ
246
+ fingerprint: 101-11692+105-11144+116-8911+117-8124+115-7571+97-7497+110-6565+111-5996+114-5791+109-5633+99-3509+100-3344+108-2704+112-2312+98-1557+118-1459+113-1377+103-1035+102-938+104-840+230-688+106-563+120-455+121-105+235-89+122-56+339-27
247
+
248
+ lav:
249
+ name: Latvian
250
+ iso1: lv
251
+ source: bible/latvian_nt/latvian_nt_utf8.txt
252
+ size: 652159
253
+ utf8: aisenturmāvkdjlpoīēbzcgņšūļžķģfhč
254
+ fingerprint: 97-10928+105-9618+115-8594+101-6433+110-6162+116-6119+117-5874+114-3788+109-3666+257-3638+118-3598+107-3410+100-3357+106-2891+108-2467+112-2341+111-2333+299-2253+275-2076+98-1903+122-1790+99-1398+103-1315+326-1208+353-1068+363-885+316-443+382-147+311-99+291-66+102-49+104-43+269-24
255
+
256
+ lit:
257
+ name: Lithuanian
258
+ iso1: lt
259
+ source: bible/lithuanian/lithuanian_utf8.txt
260
+ size: 2622771
261
+ utf8: iaseoturnkmvjpdlšėgbyųžąįūęzčhcf
262
+ fingerprint: 105-12588+97-11808+115-8495+101-6142+111-5914+116-5394+117-5240+114-4783+110-4766+107-4222+109-3091+118-2924+106-2768+112-2691+100-2681+108-2648+353-2030+279-2010+103-1574+98-1407+121-1323+371-1076+382-868+261-867+303-693+363-600+281-358+122-350+269-327+104-170+99-102+102-73
263
+
264
+ mri:
265
+ name: Maori
266
+ iso1: mi
267
+ bibl: mao
268
+ source: bible/maori/maori_utf8.txt
269
+ size: 3363174
270
+ utf8: aioetknhurmgwp
271
+ fingerprint: 97-23163+105-11136+111-9086+101-9005+116-8452+107-7771+110-6216+104-5930+117-5343+114-5047+109-2832+103-2369+119-2119+112-1524
272
+
273
+ nor:
274
+ name: Norwegian
275
+ iso1: 'no' # a 'no' without quotes would be interpreted by yaml as false!
276
+ source: bible/norwegian/norwegian_utf8.txt
277
+ size: 2856143
278
+ utf8: erntsodaiglmkhvfuåbjøpyæ
279
+ fingerprint: 101-15294+114-7967+110-7421+116-6589+115-6369+111-6333+100-6281+97-6127+105-5439+103-4869+108-4793+109-3692+107-3266+104-2699+118-2435+102-2114+117-1589+229-1370+98-1275+106-1247+248-1022+112-908+121-600+230-271
280
+
281
+ por:
282
+ name: Portuguese
283
+ iso1: pt
284
+ source: corpora/por.portuguese.utf-8.txt
285
+ size: 3168005
286
+ utf8: eaosridmuntclpvhqfgbãáéjzçêíxóàõúôâ
287
+ fingerprint: 101-13242+97-11555+111-11259+115-9136+114-6758+105-5214+100-5105+109-4739+117-4552+110-4456+116-4133+99-3016+108-2716+112-2364+118-1604+104-1421+113-1133+102-1059+103-980+98-976+227-945+225-557+233-522+106-508+122-431+231-401+234-315+237-277+120-176+243-171+224-87+245-59+250-52+244-37+226-14
288
+
289
+ rom:
290
+ name: Romany
291
+ source: bible/romani/romani_utf8.txt
292
+ size: 799230
293
+ utf8: aeisnolkrdhtmupvgcbzwxjfy
294
+ fingerprint: 97-15760+101-11712+105-8702+115-7047+110-6918+111-6350+108-5689+107-4714+114-4495+100-4351+104-4228+116-3801+109-3363+117-3248+112-2191+118-2094+103-1348+99-1089+98-881+122-800+119-342+120-337+106-298+102-203+121-25
295
+
296
+ ron:
297
+ name: Romanian
298
+ iso1: ro
299
+ bibl: rum
300
+ source: bible/romanian_cornilescu/romanian_cornilescu_utf8.txt
301
+ size: 3094740
302
+ utf8: ieaunrtlcosămdpîşvţfzbghj
303
+ fingerprint: 105-11276+101-10539+97-9609+117-7142+110-6125+114-5922+116-5788+108-5203+99-4968+111-4701+115-4105+259-4091+109-3241+100-3140+112-3032+238-2352+351-1935+118-1626+355-1120+102-1118+122-826+98-809+103-650+104-412+106-256
304
+
305
+ rus:
306
+ name: Russian
307
+ iso1: ru
308
+ source: bible/russian_makarij/russian_makarij_utf8.txt
309
+ size: 566856
310
+ utf8: оиаеътснвлрдмкпугяыјбзьoйхжічшюцщфэneurbmsѕ
311
+ fingerprint: 1086-10340+1080-7173+1072-7021+1077-6793+1098-6003+1090-5408+1089-5200+1085-5145+1074-4808+1083-4060+1088-3637+1076-3224+1084-2822+1082-2588+1087-2517+1091-2306+1075-2133+1103-1933+1099-1777+1112-1775+1073-1708+1079-1584+1100-1447+111-1034+1081-1015+1093-982+1078-968+1110-926+1095-866+1096-743+1102-683+1094-407+1097-299+1092-131+1101-76+110-55+101-55+117-54+114-54+98-54+109-53+115-53+1109-50
312
+
313
+ srp:
314
+ name: Serbian (Croatian)
315
+ iso1: sr
316
+ source: bible/croatian/croatian_utf8.txt
317
+ size: 3019099
318
+ utf8: aoienjsutvrdmklpgzbšhčćcžđf
319
+ fingerprint: 97-11403+111-10019+105-9908+101-9465+110-5617+106-5456+115-4874+117-4284+116-4267+118-4171+114-4155+100-3750+109-3520+107-2984+108-2949+112-2448+103-1827+122-1773+98-1667+353-1306+104-1063+269-810+263-772+99-603+382-573+273-234+102-74
320
+
321
+ wen:
322
+ name: Sorbian languages
323
+ source: corpora/wen.sorbish.utf-8.txt
324
+ size: 6871623
325
+ utf8: aeojnwsitrkduhmpyclzběšćžłóźčřgfńövx
326
+ fingerprint: 97-9290+101-8335+111-8218+106-5718+110-5480+119-4786+115-4658+105-4410+116-3764+114-3688+107-3520+100-3407+117-3398+104-3206+109-3135+112-2454+121-2432+99-2274+108-2254+122-2246+98-2092+283-1717+353-1446+263-1436+382-1409+322-1277+243-937+378-920+269-758+345-589+103-190+102-170+324-157+246-135+118-35+120-11
327
+
328
+ spa:
329
+ name: Spanish - Castilian
330
+ iso1: es
331
+ source: corpora/spa.spanish-castilian.utf-8.txt
332
+ size: 2942875
333
+ utf8: eaosrnlditucmphbyvqjgíáóéfzñúx
334
+ fingerprint: 101-12863+97-11473+111-9525+115-8330+114-6464+110-5964+108-5434+100-5344+105-5127+116-4008+117-3998+99-3387+109-2532+112-2204+104-1907+98-1602+121-1420+118-1226+113-1126+106-1101+103-908+237-725+225-691+243-656+233-654+102-553+122-371+241-173+250-154+120-54
335
+
336
+ swe:
337
+ name: Swedish
338
+ iso1: sv
339
+ source: corpora/swe.swedish.utf-8.txt
340
+ size: 8249056
341
+ utf8: eartnsildomkgvfäuhpöåbcjyxwzé
342
+ fingerprint: 101-9738+97-9129+114-8656+116-8411+110-8358+115-6352+105-5781+108-5342+100-4280+111-4239+109-3526+107-3375+103-3194+118-2437+102-2090+228-1969+117-1947+104-1893+112-1873+246-1505+229-1502+98-1490+99-1342+106-664+121-616+120-148+119-70+122-22+233-21
343
+
344
+ tgl:
345
+ name: Tagalog
346
+ iso1: tl
347
+ source: bible/tagalog_1905/tagalog_1905_utf8.txt
348
+ size: 3919850
349
+ utf8: angisotkylmupbhdrwejcvzf
350
+ fingerprint: 97-25597+110-14169+103-9447+105-8807+115-5049+111-4691+116-4342+107-3870+121-3621+108-3431+109-3414+117-2605+112-2588+98-2038+104-1688+100-1389+114-1187+119-856+101-784+106-166+99-129+118-46+122-45+102-24
351
+
352
+ tha:
353
+ name: Thai
354
+ iso1: th
355
+ source: bible/thai_kjv/thai_kjv_utf8.txt
356
+ size: 3649596
357
+ utf8: านรอเ้ง่ะักยลมวทหขดพจีิคแตบสป์ไูใืช็โุoำึผถซญฮธnศษฟณภฉฝๆฏฐฤฆ๋
358
+ fingerprint: 3634-7583+3609-5349+3619-4959+3629-4737+3648-4704+3657-4604+3591-4553+3656-4263+3632-3122+3633-2912+3585-2895+3618-2716+3621-2557+3617-2522+3623-2515+3607-2387+3627-2189+3586-2141+3604-2069+3614-2053+3592-1864+3637-1801+3636-1792+3588-1752+3649-1742+3605-1715+3610-1574+3626-1532+3611-1329+3660-1277+3652-1212+3641-1183+3651-1024+3639-985+3594-926+3655-867+3650-717+3640-670+111-634+3635-592+3638-568+3612-481+3606-455+3595-316+3597-277+3630-253+3608-226+110-218+3624-213+3625-203+3615-146+3603-134+3616-92+3593-84+3613-78+3654-44+3599-31+3600-28+3620-27+3590-25+3659-23
359
+
360
+ tur:
361
+ name: Turkish
362
+ iso1: tr
363
+ source: corpora/tur.turkish.utf-8.txt
364
+ size: 8641602
365
+ utf8: aeirlnkıdtmsuyobüzgşvhcçpğöfİjäwâãxîå
366
+ fingerprint: 97-11237+101-9918+105-9269+114-7424+108-7280+110-7102+107-4467+305-3973+100-3944+116-3939+109-3746+115-3435+117-3064+121-2791+111-2486+98-2218+252-1781+122-1395+103-1363+351-1360+118-1209+104-981+99-971+231-935+112-923+287-904+246-800+102-555+304-162+106-110+228-42+119-37+226-31+227-30+120-28+238-26+229-13
367
+
368
+ ukr:
369
+ name: Ukrainian
370
+ iso1: uk
371
+ source: bible/ukrainian_1871/ukrainian_1871_utf8.txt
372
+ size: 590457
373
+ utf8: оаивнестірлдумпкбгяйьзїжхчnшющєцф
374
+ fingerprint: 1086-10730+1072-7105+1080-7022+1074-6059+1085-5214+1077-5160+1089-4861+1090-4777+1110-4394+1088-3756+1083-3408+1076-3343+1091-3200+1084-3064+1087-2590+1082-2519+1073-2342+1075-2219+1103-1908+1081-1860+1100-1814+1079-1769+1111-1416+1078-1401+1093-1381+1095-1361+110-1354+1096-1006+1102-892+1097-871+1108-675+1094-426+1092-57
375
+
376
+ ppk:
377
+ name: Uma
378
+ source: bible/uma/uma_utf8.txt
379
+ size: 969131
380
+ utf8: aoinutempklhrgswbdyjcfz
381
+ fingerprint: 97-20282+111-12108+105-9323+110-7106+117-6498+116-6239+101-5501+109-4900+112-4773+107-4011+108-3551+104-3397+114-3308+103-1694+115-1593+119-1532+98-1523+100-1362+121-522+106-410+99-309+102-31+122-16
382
+
383
+ vie:
384
+ name: Vietnamese
385
+ iso1: vi
386
+ source: bible/vietnamese_1934/vietnamese_1934_utf8.txt
387
+ size: 2877551
388
+ utf8: nhicgtaomđàrvưluysáôêbờkpóơếìứúạdấớðãủầềâeảộậxốẽựữởằùọợòịặíệồẻăqắừểỏẳéổũễụèửỡỗỉjẹẫýõĩẩ
389
+ fingerprint: 110-11578+104-7754+105-6648+99-6449+103-6109+116-5507+97-4372+111-2610+109-2518+273-2422+224-2385+114-2134+118-2083+432-2059+108-1976+117-1835+121-1744+115-1624+225-1455+244-1380+234-1340+98-1218+7901-1144+107-988+112-977+243-951+417-891+7871-802+236-738+7913-714+250-712+7841-666+100-657+7845-592+7899-563+240-542+227-535+7911-501+7847-498+7873-492+226-478+101-472+7843-429+7897-426+7853-408+120-404+7889-391+7869-349+7921-347+7919-340+7903-336+7857-318+249-312+7885-296+7907-291+242-260+7883-256+7863-241+237-236+7879-230+7891-228+7867-226+259-208+113-202+7855-198+7915-178+7875-177+7887-162+7859-147+233-140+7893-135+361-119+7877-103+7909-98+232-95+7917-93+7905-86+7895-86+7881-63+106-40+7865-40+7851-37+253-24+245-21+297-19+7849-19
390
+
391
+ wol:
392
+ name: Wolof
393
+ iso1: wo
394
+ source: bible/wolof/wolof_utf8.txt
395
+ size: 649171
396
+ utf8: aneioulmkydgbtsrwxcñjàéëpfóqŋ
397
+ fingerprint: 97-13971+110-9286+101-8574+105-6580+111-6182+117-5720+108-5145+109-4547+107-3911+121-3776+100-3602+103-3309+98-3259+116-3143+115-2484+114-2428+119-1966+120-1687+99-1652+241-1604+106-1345+224-1251+233-1080+235-1078+112-1017+102-994+243-265+113-88+331-41
398
+
399
+ xho:
400
+ name: Xhosa
401
+ iso1: xh
402
+ source: bible/xhosa/xhosa_utf8.txt
403
+ size: 3255471
404
+ utf8: aeniouklhbmywstzgdprvxfqcj
405
+ fingerprint: 97-14460+101-9449+110-9134+105-8849+111-6723+117-6621+107-5942+108-5088+104-4592+98-4123+109-3418+121-3272+119-3193+115-2913+116-2520+122-2335+103-2223+100-1717+112-737+114-510+118-493+120-440+102-390+113-365+99-273+106-207
406
+