scylla 0.8.32 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/scylla/generator.rb +6 -2
- data/lib/scylla/lms/arabic.lm +318 -318
- data/lib/scylla/lms/bulgarian.lm +326 -326
- data/lib/scylla/lms/chinese.lm +399 -399
- data/lib/scylla/lms/french.lm +302 -302
- data/lib/scylla/lms/greek.lm +119 -119
- data/lib/scylla/lms/hebrew.lm +168 -168
- data/lib/scylla/lms/hindi.lm +108 -108
- data/lib/scylla/lms/japanese.lm +65 -65
- data/lib/scylla/lms/kannada.lm +147 -147
- data/lib/scylla/lms/korean.lm +151 -151
- data/lib/scylla/lms/marathi.lm +133 -133
- data/lib/scylla/lms/persian.lm +107 -107
- data/lib/scylla/lms/polish.lm +108 -108
- data/lib/scylla/lms/portuguese.lm +221 -221
- data/lib/scylla/lms/romanian.lm +132 -132
- data/lib/scylla/lms/russian.lm +82 -82
- data/lib/scylla/lms/thai.lm +119 -119
- data/lib/scylla/resources.rb +0 -1
- data/test/helper.rb +0 -1
- metadata +40 -55
- data/Gemfile +0 -23
- data/Gemfile.lock +0 -53
- data/Rakefile +0 -52
- data/VERSION +0 -1
- data/lib/scylla/lms/afrikaans.lm +0 -400
- data/pkg/scylla-0.5.0.gem +0 -0
- data/scylla-0.8.29.gem +0 -0
- data/scylla-0.8.31.gem +0 -0
- data/scylla.gemspec +0 -24
- data/source_texts/afrikaans.txt +0 -363
- data/source_texts/arabic.txt +0 -718
- data/source_texts/bulgarian.txt +0 -601
- data/source_texts/catalan.txt +0 -435
- data/source_texts/chinese.txt +0 -625
- data/source_texts/czech.txt +0 -237
- data/source_texts/danish.txt +0 -268
- data/source_texts/dutch.txt +0 -503
- data/source_texts/english.txt +0 -673
- data/source_texts/finnish.txt +0 -939
- data/source_texts/french.txt +0 -896
- data/source_texts/german.txt +0 -1236
- data/source_texts/greek.txt +0 -488
- data/source_texts/hebrew.txt +0 -638
- data/source_texts/hindi.txt +0 -353
- data/source_texts/icelandic.txt +0 -342
- data/source_texts/indonesian.txt +0 -509
- data/source_texts/italian.txt +0 -1066
- data/source_texts/japanese.txt +0 -1220
- data/source_texts/kannada.txt +0 -340
- data/source_texts/korean.txt +0 -343
- data/source_texts/marathi.txt +0 -237
- data/source_texts/norwegian.txt +0 -555
- data/source_texts/persian.txt +0 -886
- data/source_texts/polish.txt +0 -1014
- data/source_texts/portuguese.txt +0 -690
- data/source_texts/romanian.txt +0 -436
- data/source_texts/russian.txt +0 -1128
- data/source_texts/slovak.txt +0 -575
- data/source_texts/slovenian.txt +0 -354
- data/source_texts/spanish.txt +0 -1017
- data/source_texts/swedish.txt +0 -558
- data/source_texts/tagalog.txt +0 -426
- data/source_texts/thai.txt +0 -312
- data/source_texts/turkish.txt +0 -665
- data/source_texts/vietnamese.txt +0 -300
- data/source_texts/welsh.txt +0 -332
data/lib/scylla/generator.rb
CHANGED
@@ -5,7 +5,7 @@ require 'unicode'
|
|
5
5
|
module Scylla
|
6
6
|
class Generator
|
7
7
|
attr_accessor :dirtext, :dirlm, :minsize, :delimiter
|
8
|
-
|
8
|
+
NONLATIN = ["bg","ar","ru","zh","ja","he","kn","ko","mr","hi","th","fa","el"]
|
9
9
|
# dirtext: The location of the source training text files
|
10
10
|
# minsize: The minimum size of the ngrams that you would like to store
|
11
11
|
def initialize(dirtext = DEFAULT_SOURCE_DIR, dirlm = DEFAULT_TARGET_DIR, minsize = 0, silent = false, delimiter = "[[classifier_delimiter]]")
|
@@ -22,11 +22,13 @@ module Scylla
|
|
22
22
|
languages = Dir.glob(@dirlm + "/*.lm")
|
23
23
|
languages.each {|l| File.delete(l) }
|
24
24
|
locales = Scylla::Resources.locales
|
25
|
+
get_wikis
|
25
26
|
locales.each do |key, value|
|
26
27
|
path = File.join(@dirtext, "#{key}.txt")
|
27
28
|
text = ""
|
28
29
|
File.open(path).each { |line| text += " " + line }
|
29
30
|
write_lm(text, key)
|
31
|
+
File.delete(path)
|
30
32
|
end
|
31
33
|
end
|
32
34
|
|
@@ -53,7 +55,9 @@ module Scylla
|
|
53
55
|
value = value.gsub(/\{\{(.+?)\}\}/m,"")
|
54
56
|
value = value.gsub(/\{(.+?)\}/m,"")
|
55
57
|
value = value.gsub(/\[(.+?)\]/m,"")
|
56
|
-
Sanitize.clean(value)
|
58
|
+
value = Sanitize.clean(value)
|
59
|
+
value = value.gsub(/[a-zA-Z]/,"") if NONLATIN.include?(locale)
|
60
|
+
clean(value)
|
57
61
|
end
|
58
62
|
|
59
63
|
# Reads a single text file specified by a path and writes a .lm file in
|