langa 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. data/COPYING +674 -0
  2. data/README +69 -0
  3. data/bin/langa +169 -0
  4. data/examples/afrikaans_1953_utf8.txt +1000 -0
  5. data/examples/albanian_utf8.txt +1000 -0
  6. data/examples/amharic_utf8.txt +1000 -0
  7. data/examples/arabic_svd_utf8.txt +1000 -0
  8. data/examples/armenian_western_1853_utf8.txt +1000 -0
  9. data/examples/asv_utf8.txt +1000 -0
  10. data/examples/basque_1571_utf8.txt +1000 -0
  11. data/examples/breton_utf8.txt +1000 -0
  12. data/examples/chinese_ncv_s_utf8.txt +1000 -0
  13. data/examples/chinese_ncv_utf8.txt +1000 -0
  14. data/examples/chinese_union_s_utf8.txt +1000 -0
  15. data/examples/chinese_union_utf8.txt +1000 -0
  16. data/examples/coptic_nt_utf8.txt +1000 -0
  17. data/examples/croatian_utf8.txt +1000 -0
  18. data/examples/czech_bkr_utf8.txt +1000 -0
  19. data/examples/danish_utf8.txt +1000 -0
  20. data/examples/dutch_svv_utf8.txt +1000 -0
  21. data/examples/esperanto_utf8.txt +1000 -0
  22. data/examples/estonian_utf8.txt +1000 -0
  23. data/examples/finnish_pr_1992_utf8.txt +1000 -0
  24. data/examples/french_ostervald_1996_utf8.txt +1000 -0
  25. data/examples/german_schlachter_1951_utf8.txt +1000 -0
  26. data/examples/greek_byzantine_2000_utf8.txt +1000 -0
  27. data/examples/greek_modern_utf8.txt +1000 -0
  28. data/examples/hebrew_modern_utf8.txt +1000 -0
  29. data/examples/hungarian_karoli_utf8.txt +1000 -0
  30. data/examples/italian_riveduta_1927_utf8.txt +1000 -0
  31. data/examples/kabyle_nt_utf8.txt +1000 -0
  32. data/examples/kjv_apocrypha_utf8.txt +1000 -0
  33. data/examples/korean_utf8.txt +1000 -0
  34. data/examples/latin_vulgata_clementina_utf8.txt +1000 -0
  35. data/examples/latvian_nt_utf8.txt +1000 -0
  36. data/examples/lithuanian_utf8.txt +1000 -0
  37. data/examples/manx_gaelic_utf8.txt +1000 -0
  38. data/examples/maori_utf8.txt +1000 -0
  39. data/examples/myanmar_judson_1835_utf8.txt +1000 -0
  40. data/examples/norwegian_utf8.txt +1000 -0
  41. data/examples/peshitta_utf8.txt +1000 -0
  42. data/examples/portuguese_utf8.txt +1000 -0
  43. data/examples/romani_utf8.txt +1000 -0
  44. data/examples/romanian_cornilescu_utf8.txt +1000 -0
  45. data/examples/russian_makarij_utf8.txt +1000 -0
  46. data/examples/spanish_reina_valera_1909_utf8.txt +1000 -0
  47. data/examples/swedish_1917_utf8.txt +1000 -0
  48. data/examples/tagalog_1905_utf8.txt +1000 -0
  49. data/examples/thai_kjv_utf8.txt +1000 -0
  50. data/examples/turkish_nt_utf8.txt +1000 -0
  51. data/examples/turkish_utf8.txt +1000 -0
  52. data/examples/ukrainian_1871_utf8.txt +1000 -0
  53. data/examples/vietnamese_1934_utf8.txt +1000 -0
  54. data/examples/wolof_utf8.txt +1000 -0
  55. data/examples/xhosa_utf8.txt +1000 -0
  56. data/lib/langa.rb +35 -0
  57. data/lib/langa/dna.rb +209 -0
  58. data/lib/langa/file.rb +97 -0
  59. data/lib/langa/langa.dna +406 -0
  60. data/lib/langa/languageanalyzer.rb +134 -0
  61. data/lib/langa/languages.rb +147 -0
  62. data/lib/langa/randomtestfiles.rb +140 -0
  63. data/lib/langa/utilities.rb +53 -0
  64. data/test/tc_file.rb +47 -0
  65. data/test/tc_languages.rb +69 -0
  66. data/test/tc_utilities.rb +42 -0
  67. data/unicode/CaseFolding.txt +1065 -0
  68. data/unicode/CaseFolding.txt.webloc +8 -0
  69. data/unicode/Index of -Public-MAPPINGS.webloc b/data/unicode/Index of → -Public-MAPPINGS.webloc +0 -0
  70. data/unicode/mappings/8859-1.TXT +303 -0
  71. data/unicode/mappings/8859-10.TXT +303 -0
  72. data/unicode/mappings/8859-11.TXT +297 -0
  73. data/unicode/mappings/8859-13.TXT +299 -0
  74. data/unicode/mappings/8859-14.TXT +301 -0
  75. data/unicode/mappings/8859-15.TXT +303 -0
  76. data/unicode/mappings/8859-16.TXT +299 -0
  77. data/unicode/mappings/8859-2.TXT +303 -0
  78. data/unicode/mappings/8859-3.TXT +296 -0
  79. data/unicode/mappings/8859-4.TXT +303 -0
  80. data/unicode/mappings/8859-5.TXT +303 -0
  81. data/unicode/mappings/8859-6.TXT +260 -0
  82. data/unicode/mappings/8859-7.TXT +308 -0
  83. data/unicode/mappings/8859-8.TXT +270 -0
  84. data/unicode/mappings/8859-9.TXT +307 -0
  85. data/unicode/mappings/ATARIST.TXT +313 -0
  86. data/unicode/mappings/CP037.TXT +275 -0
  87. data/unicode/mappings/CP1006.TXT +302 -0
  88. data/unicode/mappings/CP1026.TXT +275 -0
  89. data/unicode/mappings/CP1250.TXT +274 -0
  90. data/unicode/mappings/CP1251.TXT +274 -0
  91. data/unicode/mappings/CP1252.TXT +274 -0
  92. data/unicode/mappings/CP1253.TXT +274 -0
  93. data/unicode/mappings/CP1254.TXT +274 -0
  94. data/unicode/mappings/CP1255.TXT +274 -0
  95. data/unicode/mappings/CP1256.TXT +274 -0
  96. data/unicode/mappings/CP1257.TXT +274 -0
  97. data/unicode/mappings/CP1258.TXT +274 -0
  98. data/unicode/mappings/CP424.TXT +304 -0
  99. data/unicode/mappings/CP437.TXT +274 -0
  100. data/unicode/mappings/CP500.TXT +275 -0
  101. data/unicode/mappings/CP737.TXT +274 -0
  102. data/unicode/mappings/CP775.TXT +275 -0
  103. data/unicode/mappings/CP850.TXT +274 -0
  104. data/unicode/mappings/CP852.TXT +274 -0
  105. data/unicode/mappings/CP855.TXT +275 -0
  106. data/unicode/mappings/CP856.TXT +303 -0
  107. data/unicode/mappings/CP857.TXT +275 -0
  108. data/unicode/mappings/CP860.TXT +275 -0
  109. data/unicode/mappings/CP861.TXT +275 -0
  110. data/unicode/mappings/CP862.TXT +275 -0
  111. data/unicode/mappings/CP863.TXT +275 -0
  112. data/unicode/mappings/CP864.TXT +275 -0
  113. data/unicode/mappings/CP865.TXT +275 -0
  114. data/unicode/mappings/CP866.TXT +275 -0
  115. data/unicode/mappings/CP869.TXT +275 -0
  116. data/unicode/mappings/CP874.TXT +274 -0
  117. data/unicode/mappings/CP875.TXT +275 -0
  118. data/unicode/mappings/KOI8-R.TXT +302 -0
  119. data/unicode/mappings/NEXTSTEP.TXT +173 -0
  120. data/unicode/mappings/ROMAN.TXT +275 -0
  121. data/unicode/mappings/US-ASCII-QUOTES.TXT +198 -0
  122. metadata +180 -0
@@ -0,0 +1,134 @@
1
+ =begin
2
+
3
+ Copyright © 2007 John Vorhauer
4
+ Contact me at langa@vorhauer.de near 50°55'N+6°55'E.
5
+
6
+ This file is part of Langa.
7
+
8
+ Langa is free software: you can redistribute it and/or modify
9
+ it under the terms of the GNU General Public License as published by
10
+ the Free Software Foundation, either version 3 of the License, or
11
+ (at your option) any later version.
12
+
13
+ Langa is distributed in the hope that it will be useful,
14
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ GNU General Public License for more details.
17
+
18
+ You should have received a copy of the GNU General Public License
19
+ along with Langa. If not, see <http://www.gnu.org/licenses/>.
20
+
21
+ For a detailed functional description of Langa see README file
22
+
23
+ =end
24
+
25
+ require 'rubygems'
26
+ require 'languages'
27
+
28
+ =begin rdoc
29
+
30
+ The class LanguageAnalyzer is the heart of Langa. It has two main use
31
+ cases:
32
+
33
+ = Recognize the language of a textfile
34
+ In this mode the LanguageAnalyzer identifies the language of a textfile by
35
+ comparing the fingerprint of the textfile against the ones documented in
36
+ the language configuration file 'language.dna'. Call
37
+
38
+ la = LanguageAnalyzer.new
39
+ la.analyze('german-file') -> 'deu'
40
+
41
+ If you wish additional codepage conversion for the input file call
42
+
43
+ la.analyze('german-file-iso-8859-1', '8859-1') -> 'deu'
44
+
45
+ = Create a new language fingerprint
46
+ If you have a big textfile of a previously unknown language, you can
47
+ calculate the fingerprint of this language and add it to the language
48
+ configuration file 'language.dna'. Call
49
+
50
+ la = LanguageAnalyzer.new
51
+ la.scan_language_dna('landir/*')
52
+
53
+ to scan all files from the landir directory. To automatically identify
54
+ the iso 639 language codes and the codepage that should be used for reading,
55
+ name the input files in a form '<iso-code>.<Language>.<codepage>.txt', i.e.
56
+ 'landir/deu.German.utf-8.txt'.
57
+
58
+ =end
59
+
60
class LanguageAnalyzer

  # Create a new instance of the LanguageAnalyzer.
  #   la = LanguageAnalyzer.new
  # +language_file+ is passed unchanged to Languages.new, which loads the
  # language configuration.
  def initialize(language_file='language.dna')
    @languages = Languages.new(language_file)
  end

  # Get the sorted keys of all known languages.
  #   la.keys -> ['deu', 'eng', ...]
  def keys
    @languages.keys.sort
  end

  # Get the configuration of a single language (delegates to
  # Languages#config).
  #   la.config('deu') -> {'name'=>'German', 'iso1'=>'de', ...}
  def config(key)
    @languages.config(key)
  end

  # Get the source files of all known languages.
  #   la.sources -> ["corpora/ger.german.utf-8.txt", ...]
  def sources
    @languages.values_for('source').keys
  end

  # Analyze the language of a file.
  #   la.analyze('german-file-utf8') -> 'deu'
  #   la.analyze('german-file-iso-8859-1', '8859-1') -> 'deu'
  #
  # filename::    file to analyze, handed to DNA#feed
  # codepage::    codepage of the file, handed to DNA#feed
  # full_detail:: when true, return the complete protocol of the analysis
  #               instead of only the best match
  #
  # Returns the key of the language with the smallest distance, or +nil+
  # when no languages are configured. With +full_detail+ an array of
  # <tt>[distance, key, name]</tt> triples, ordered by language key, is
  # returned.
  def analyze( filename, codepage='utf-8', full_detail=false )
    dna = DNA.new
    dna.feed(filename, codepage)
    # NOTE(review): the result is not used; the call is kept in case
    # DNA#fingerprint finalises internal state - confirm and remove.
    dna.fingerprint

    lang_score = @languages.keys.map do |key|
      lang = @languages.config(key)
      [dna.distance(lang['dna']), key, lang['name']]
    end

    return lang_score.sort { |a, b| a[1] <=> b[1] } if full_detail

    # Array comparison orders by distance first, so the minimum is the
    # best match. An empty configuration yields nil instead of raising.
    best = lang_score.min
    best.nil? ? nil : best[1]
  end

  # Create a new dna fingerprint for every file matching +pattern+ and print
  # it as a YAML snippet to standard output. Copy the output to the language
  # configuration file and replace the placeholders for language code and
  # name by hand.
  #   la.scan_language_dna('landir/*')
  #
  # pattern::  glob pattern of the files to scan
  # codepage:: codepage used for reading every matched file
  #
  # NOTE(review): deriving language code, name and codepage from a file name
  # of the form '<iso-code>.<Language>.<codepage>.txt' is not implemented;
  # every file is read with +codepage+.
  #
  # Returns the list of scanned file names.
  def scan_language_dna( pattern = '*', codepage = 'utf-8' )
    Dir[ pattern ].each do |filename|
      dna = DNA.new
      dna.feed(filename, codepage)

      puts Languages.to_paste('<iso 639-3 code>', {
        'name'        => '<full language name>',
        'iso1'        => '<iso 639-1 code (optional)>',
        'source'      => filename,
        'size'        => dna.size,
        'utf8'        => dna.to_utf8,
        'fingerprint' => dna.to_s })
    end
  end

end
132
+
133
+ #la = LanguageAnalyzer.new
134
+ #p la.sources
@@ -0,0 +1,147 @@
1
+ =begin
2
+
3
+ Copyright © 2007 John Vorhauer
4
+ Contact me at langa@vorhauer.de near 50°55'N+6°55'E.
5
+
6
+ This file is part of Langa.
7
+
8
+ Langa is free software: you can redistribute it and/or modify
9
+ it under the terms of the GNU General Public License as published by
10
+ the Free Software Foundation, either version 3 of the License, or
11
+ (at your option) any later version.
12
+
13
+ Langa is distributed in the hope that it will be useful,
14
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ GNU General Public License for more details.
17
+
18
+ You should have received a copy of the GNU General Public License
19
+ along with Langa. If not, see <http://www.gnu.org/licenses/>.
20
+
21
+ For a detailed functional description of Langa see README file
22
+
23
+ =end
24
+
25
+ require 'rubygems'
26
+ require 'yaml'
27
+ require 'dna'
28
+
29
+ =begin rdoc
30
+ The class Languages handles attributes of different languages, particularly
31
+ the dna fingerprint used for language recognition.
32
+
33
+ The attributes for each language are stored in a YAML file of the form:
34
+
35
+ <three letter iso 639-3 language code>:
36
+ name: <name of the language>
37
+ iso1: <two letter iso 639-1 language code (optional)>
38
+ bibl: <three letter iso 639-2 bibliographic code (optional)>
39
+ source: <source file used for fingerprint creation>
40
+ size: <number of relevant characters for fingerprint creation>
41
+ utf8: <utf-8 representation of fingerprint>
42
+ fingerprint: <dna fingerprint of language>
43
+
44
+ i.e. this is shown for the german language
45
+
46
+ deu:
47
+ name: German
48
+ iso1: de
49
+ bibl: ger
50
+ source: corpora/ger.german.utf-8.txt
51
+ size: 92273185
52
+ utf8: enirtsadhlugcmobfkwzpvüäjöyxq
53
+ fingerprint: 101-16251+110-9918+105-7865+114-7637+116-6348...
54
+
55
+ For ISO 639-x codes see http://www.sil.org/ISO639-3/codes.asp
56
+
57
+ =end
58
+
59
class Languages

  # Attributes that are always written first, in this order, by to_paste.
  ATTRIBUTE_ORDER =
    ['name', 'iso1', 'bibl', 'source', 'size', 'utf8', 'fingerprint'].freeze

  # Create the YAML representation of a language configuration for
  # manually pasting this to the language configuration file.
  #   Languages.to_paste('deu', {...}) -> "deu:\n    name:   German\n ..."
  #
  # key::    language key used as the top-level YAML key
  # config:: hash of attribute name => value; it is not modified
  # indent:: indentation width, also used to align the values
  #
  # The well-known attributes are emitted first in a fixed order, followed
  # by any remaining attributes of +config+.
  def Languages.to_paste(key, config, indent=4)
    str = "#{key}:\n"
    ATTRIBUTE_ORDER.each do |name|
      str << paste_line(name, config[name], indent) if config.has_key?(name)
    end
    (config.keys - ATTRIBUTE_ORDER).each do |name|
      str << paste_line(name, config[name], indent)
    end
    str
  end

  # Format a single "<indent><name>:<padding><value>\n" line for to_paste.
  def Languages.paste_line(name, value, indent)
    ind = ' ' * indent
    # Pad after the colon up to the next multiple of the indent width.
    pad = ind[(name.to_s.size + 1).modulo(indent)..indent - 1]
    "%s%s:%s%s\n" % [ind, name, pad, value]
  end
  private_class_method :paste_line

  # Create a Languages object to query language attributes.
  #   la = Languages.new('language.dna')
  def initialize(config_file)
    @languages = load_language_configuration(config_file)
  end

  # Get the keys of all known languages. The keys are named according to
  # ISO 639-3.
  #   la.keys -> ['deu', 'eng', ...]
  def keys
    @languages.keys.sort
  end

  # Get the values of all languages for given attribute +name+. With +lcase+
  # you can force the results to be lowercased. Languages that do not define
  # the attribute are skipped. Returns +nil+ when +name+ is +nil+.
  #   la.values_for('name')       -> {'German'=>'deu', 'English'=>'eng', ...}
  #   la.values_for('name', true) -> {'german'=>'deu', 'english'=>'eng', ...}
  def values_for(name, lcase=false)
    return nil if name.nil?

    result = {}
    @languages.each do |key, val|
      value = val[name]
      next if value.nil?
      result[lcase ? value.downcase : value] = key
    end
    result
  end

  # Get the complete configuration for a specific language. You can use any
  # ISO 639 shortcut as a key (i.e. for german you can use 'deu', 'de' and
  # 'ger') or the language name, which is matched case-insensitively.
  # Returns +nil+ for an unknown key.
  #   la.config('deu') -> {"name"=>"German", "iso1"=>"de", "bibl"=>"ger", ...}
  def config(key)
    return nil if key.nil?
    return @languages[key] if @languages.has_key?(key)

    # Translate the alternative codes in order of precedence.
    ['iso1', 'bibl'].each do |attribute|
      map = values_for(attribute)
      return @languages[map[key]] if map.has_key?(key)
    end

    # The name map is lowercased, so the key has to be lowercased as well.
    by_name = values_for('name', true)
    name = key.respond_to?(:downcase) ? key.downcase : key
    by_name.has_key?(name) ? @languages[by_name[name]] : nil
  end


  private

  # Load the YAML language configuration from +filename+ and attach a DNA
  # object, built from the 'fingerprint' attribute, to every language under
  # the key 'dna'. Returns the resulting hash; an empty file yields an empty
  # hash.
  #
  # NOTE: YAML.load is applied to the configuration file, so only load
  # configuration files from a trusted source.
  def load_language_configuration(filename)
    # The block form closes the file handle as soon as parsing is done.
    data = File.open(filename) { |io| YAML.load(io) }
    (data || {}).each_value do |lang|
      lang['dna'] = DNA.new(lang['fingerprint'])
    end
  end

end
145
+
146
+ #la = Languages.new
147
+ #p la.values_for('name', true)
@@ -0,0 +1,140 @@
1
+ =begin
2
+
3
+ Copyright © 2007 John Vorhauer
4
+ Contact me at langa@vorhauer.de near 50°55'N+6°55'E.
5
+
6
+ This file is part of Langa.
7
+
8
+ Langa is free software: you can redistribute it and/or modify
9
+ it under the terms of the GNU General Public License as published by
10
+ the Free Software Foundation, either version 3 of the License, or
11
+ (at your option) any later version.
12
+
13
+ Langa is distributed in the hope that it will be useful,
14
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ GNU General Public License for more details.
17
+
18
+ You should have received a copy of the GNU General Public License
19
+ along with Langa. If not, see <http://www.gnu.org/licenses/>.
20
+
21
+ For a detailed functional description of Langa see README file
22
+
23
+ =end
24
+
25
+ require 'rubygems'
26
+
27
+ require 'fileutils'
28
+ require 'languages'
29
+
30
+ =begin
31
+
32
+ The class RandomTestFiles creates various files in different languages for
33
+ testing purposes. Therefore it scans the language configuration for source
34
+ files and, if existing, extracts a portion from random position of it to an
35
+ extra file in a testing directory. The files are removed, when object life
36
+ cycle ends.
37
+
38
+ =end
39
+
40
class RandomTestFiles

  # Name of the temporary directory (below the project root) that receives
  # the generated test files.
  RTF_DIR = 'rltf.temp'.freeze

  # Collect all language source files that exist and, if a block is given,
  # immediately iterate over the generated test files (see #each).
  #
  # file_count::    number of test files to create per language
  # chunk_size::    size in bytes of the piece read for every test file
  # language_file:: language configuration handed to Languages.new
  def initialize(file_count, chunk_size, language_file='language.dna', &block)
    # Store test parameter
    @count = file_count
    @size  = chunk_size

    # remember root path
    @root    = File.join(File.dirname(__FILE__), '..', '..')
    @rtf_dir = File.join(@root, RTF_DIR)

    # keep only those sources that really exist below the root path
    @sources = []
    Languages.new(language_file).values_for('source').each do |source, lang|
      @sources << [lang, source] if File.exist?(File.join(@root, source))
    end

    # => initialize random seed for different results
    srand((Time.new.to_f * 10000).to_i)

    each(&block) if block_given?
  end

  alias :create :initialize

  # Yield the language key and the list of freshly created test files for
  # every known source file. The temporary directory is removed afterwards,
  # even when the block raises. Returns self.
  def each
    # => create temp directory
    FileUtils.mkdir_p(rtf_dir)

    begin
      @sources.each do |lang, source|
        yield lang, copy_random_pieces(source, @count, @size)
      end
    ensure
      clear
    end

    self
  end

  # Remove the temporary directory with all generated test files. Does
  # nothing when the directory does not exist.
  def clear
    FileUtils.rm_r(rtf_dir) if File.exist?(rtf_dir)
  end

  # Copy +file_count+ pieces of +chunk_size+ bytes from random positions of
  # the source file into the temporary directory.
  #
  # filename::   source file, relative to the project root
  # file_count:: number of pieces / test files to create
  # chunk_size:: number of bytes to read for every piece
  #
  # Returns the list of created files, or +nil+ when the source file is
  # missing or smaller than +chunk_size+. Pieces can be a few bytes shorter
  # than +chunk_size+ because incomplete utf-8 sequences are cut off.
  def copy_random_pieces(filename, file_count, chunk_size)
    source_file = File.join(root_path, filename)

    # => remember size of source, so that we don't grab past EOF
    file_size = File.exist?(source_file) ? File.size(source_file) : 0
    return nil if chunk_size > file_size

    base = File.basename(source_file).sub(/\.txt$/i, '')
    FileUtils.mkdir_p(rtf_dir)

    files = []
    File.open(source_file, 'rb') do |file|
      (1..file_count).each do |i|
        # => grab a random piece of fixed size from the source; the +1 keeps
        #    the argument positive, since rand(0) would return a Float
        file.seek(rand(file_size - chunk_size + 1), IO::SEEK_SET)
        piece = trim_broken_utf8(file.read(chunk_size))

        # => write piece to test file
        target = File.join(rtf_dir, "#{base}.#{chunk_size}.#{i}.txt")
        File.open(target, 'wb') { |fout| fout.write(piece) }
        files << target
      end
    end
    files
  end

  private

  # Project root; computed lazily so that it is also available when the
  # object was not set up through #initialize.
  def root_path
    @root ||= File.join(File.dirname(__FILE__), '..', '..')
  end

  # Absolute location of the temporary directory.
  def rtf_dir
    @rtf_dir ||= File.join(root_path, RTF_DIR)
  end

  # Cut off utf-8 sequences that were broken by grabbing a piece at an
  # arbitrary byte position. Works on byte values, so it does not depend on
  # the string encoding.
  def trim_broken_utf8(piece)
    bytes = piece.unpack('C*')

    # => continuation bytes at the start belong to a character that began
    #    before the piece
    bytes.shift while !bytes.empty? && (0x80..0xbf).include?(bytes.first)
    # => a trailing multi byte sequence may be incomplete: drop its
    #    continuation bytes and then its lead byte
    bytes.pop while !bytes.empty? && (0x80..0xbf).include?(bytes.last)
    bytes.pop if !bytes.empty? && (0xc0..0xff).include?(bytes.last)

    bytes.pack('C*')
  end

end
137
+
138
+ #rlf = RandomTestFiles.new(5,100).each do |a,b|
139
+ # p [a, b]
140
+ #end
@@ -0,0 +1,53 @@
1
+ =begin
2
+
3
+ Copyright © 2007 John Vorhauer
4
+ Contact me at langa@vorhauer.de near 50°55'N+6°55'E.
5
+
6
+ This file is part of Langa.
7
+
8
+ Langa is free software: you can redistribute it and/or modify
9
+ it under the terms of the GNU General Public License as published by
10
+ the Free Software Foundation, either version 3 of the License, or
11
+ (at your option) any later version.
12
+
13
+ Langa is distributed in the hope that it will be useful,
14
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ GNU General Public License for more details.
17
+
18
+ You should have received a copy of the GNU General Public License
19
+ along with Langa. If not, see <http://www.gnu.org/licenses/>.
20
+
21
+ For a detailed functional description of Langa see README file
22
+
23
+ =end
24
+
25
+ =begin rdoc
26
+
27
+ Extend class Array and class String to uniformly handle code conversion
28
+ to and from unicode
29
+
30
+ =end
31
+
32
class String

  # Decode the receiver as UTF-8 and return the unicode value of every
  # character as an array of integers.
  #   'äöü'.to_unicode => [228, 246, 252]
  # The same string as raw UTF-8 bytes would be
  # [195, 164, 195, 182, 195, 188].
  def to_unicode
    unpack('U*')
  end

end
42
+
43
class Array

  # Encode an array of unicode values as an UTF-8 string.
  #   [228, 246, 252].to_utf8 -> 'äöü'
  def to_utf8
    pack('U*')
  end

end
53
+