scylla 0.8.0 → 0.8.29
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +4 -0
- data/Gemfile.lock +9 -1
- data/lib/scylla/generator.rb +46 -13
- data/lib/scylla/lms/afrikaans.lm +400 -400
- data/lib/scylla/lms/arabic.lm +400 -400
- data/lib/scylla/lms/bulgarian.lm +400 -400
- data/lib/scylla/lms/catalan.lm +399 -399
- data/lib/scylla/lms/chinese.lm +400 -400
- data/lib/scylla/lms/czech.lm +400 -0
- data/lib/scylla/lms/danish.lm +396 -396
- data/lib/scylla/lms/dutch.lm +400 -0
- data/lib/scylla/lms/english.lm +400 -400
- data/lib/scylla/lms/finnish.lm +400 -400
- data/lib/scylla/lms/french.lm +398 -398
- data/lib/scylla/lms/german.lm +400 -400
- data/lib/scylla/lms/greek.lm +400 -400
- data/lib/scylla/lms/hebrew.lm +399 -399
- data/lib/scylla/lms/hindi.lm +400 -400
- data/lib/scylla/lms/icelandic.lm +399 -399
- data/lib/scylla/lms/indonesian.lm +400 -400
- data/lib/scylla/lms/italian.lm +400 -400
- data/lib/scylla/lms/japanese.lm +399 -399
- data/lib/scylla/lms/kannada.lm +400 -0
- data/lib/scylla/lms/korean.lm +400 -400
- data/lib/scylla/lms/marathi.lm +400 -0
- data/lib/scylla/lms/norwegian.lm +400 -400
- data/lib/scylla/lms/persian.lm +400 -0
- data/lib/scylla/lms/polish.lm +400 -400
- data/lib/scylla/lms/portuguese.lm +400 -400
- data/lib/scylla/lms/romanian.lm +400 -400
- data/lib/scylla/lms/russian.lm +400 -400
- data/lib/scylla/lms/slovak.lm +400 -400
- data/lib/scylla/lms/slovenian.lm +387 -387
- data/lib/scylla/lms/spanish.lm +400 -400
- data/lib/scylla/lms/swedish.lm +399 -399
- data/lib/scylla/lms/tagalog.lm +400 -400
- data/lib/scylla/lms/thai.lm +400 -400
- data/lib/scylla/lms/turkish.lm +400 -400
- data/lib/scylla/lms/vietnamese.lm +400 -400
- data/lib/scylla/lms/welsh.lm +398 -398
- data/lib/scylla/resources.rb +43 -33
- data/lib/scylla/string.rb +2 -2
- data/lib/scylla.rb +0 -4
- data/pkg/scylla-0.5.0.gem +0 -0
- data/scylla.gemspec +1 -1
- data/source_texts/afrikaans.txt +330 -81
- data/source_texts/arabic.txt +590 -448
- data/source_texts/bulgarian.txt +588 -821
- data/source_texts/catalan.txt +435 -413
- data/source_texts/chinese.txt +526 -100
- data/source_texts/czech.txt +237 -0
- data/source_texts/danish.txt +233 -184
- data/source_texts/dutch.txt +503 -0
- data/source_texts/english.txt +673 -70
- data/source_texts/finnish.txt +939 -71
- data/source_texts/french.txt +879 -465
- data/source_texts/german.txt +1236 -137
- data/source_texts/greek.txt +488 -139
- data/source_texts/hebrew.txt +539 -100
- data/source_texts/hindi.txt +254 -100
- data/source_texts/icelandic.txt +301 -90
- data/source_texts/indonesian.txt +509 -93
- data/source_texts/italian.txt +1066 -120
- data/source_texts/japanese.txt +1217 -450
- data/source_texts/kannada.txt +340 -0
- data/source_texts/korean.txt +343 -219
- data/source_texts/marathi.txt +237 -0
- data/source_texts/norwegian.txt +555 -190
- data/source_texts/persian.txt +886 -0
- data/source_texts/polish.txt +1013 -90
- data/source_texts/portuguese.txt +690 -88
- data/source_texts/romanian.txt +436 -103
- data/source_texts/russian.txt +1029 -100
- data/source_texts/slovak.txt +575 -102
- data/source_texts/slovenian.txt +353 -99
- data/source_texts/spanish.txt +858 -675
- data/source_texts/swedish.txt +558 -488
- data/source_texts/tagalog.txt +391 -100
- data/source_texts/thai.txt +286 -60
- data/source_texts/turkish.txt +635 -87
- data/source_texts/vietnamese.txt +300 -92
- data/source_texts/welsh.txt +288 -104
- data/test/fixtures/lms/danish.lm +314 -314
- data/test/fixtures/lms/english.lm +301 -301
- data/test/fixtures/lms/french.lm +326 -326
- data/test/fixtures/lms/german.lm +331 -331
- data/test/fixtures/lms/hindi.lm +191 -191
- data/test/fixtures/lms/italian.lm +299 -299
- data/test/fixtures/lms/japanese.lm +103 -103
- data/test/fixtures/lms/norwegian.lm +309 -309
- data/test/fixtures/lms/spanish.lm +331 -331
- data/test/generator_test.rb +2 -2
- metadata +14 -3
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
GEM
|
2
2
|
remote: http://rubygems.org/
|
3
3
|
specs:
|
4
|
+
character-encodings (0.4.1)
|
4
5
|
columnize (0.3.4)
|
5
6
|
git (1.2.5)
|
6
7
|
i18n (0.6.0)
|
@@ -8,6 +9,7 @@ GEM
|
|
8
9
|
bundler (~> 1.0)
|
9
10
|
git (>= 1.2.5)
|
10
11
|
rake
|
12
|
+
json (1.6.3)
|
11
13
|
linecache (0.46)
|
12
14
|
rbx-require-relative (> 0.0.4)
|
13
15
|
mail (2.3.0)
|
@@ -18,7 +20,7 @@ GEM
|
|
18
20
|
mocha (0.9.12)
|
19
21
|
nokogiri (1.4.7)
|
20
22
|
polyglot (0.3.2)
|
21
|
-
rake (0.9.2)
|
23
|
+
rake (0.9.2.2)
|
22
24
|
rbx-require-relative (0.0.5)
|
23
25
|
ruby-debug (0.10.4)
|
24
26
|
columnize (>= 0.1)
|
@@ -31,15 +33,21 @@ GEM
|
|
31
33
|
treetop (1.4.10)
|
32
34
|
polyglot
|
33
35
|
polyglot (>= 0.3.1)
|
36
|
+
unicode (0.4.0)
|
37
|
+
wikipedia-client (1.0.0)
|
34
38
|
|
35
39
|
PLATFORMS
|
36
40
|
ruby
|
37
41
|
|
38
42
|
DEPENDENCIES
|
39
43
|
bundler (~> 1.0.0)
|
44
|
+
character-encodings
|
40
45
|
jeweler (~> 1.6.4)
|
46
|
+
json
|
41
47
|
mail
|
42
48
|
mocha
|
43
49
|
ruby-debug (~> 0.10.4)
|
44
50
|
sanitize
|
45
51
|
shoulda
|
52
|
+
unicode
|
53
|
+
wikipedia-client
|
data/lib/scylla/generator.rb
CHANGED
@@ -1,16 +1,19 @@
|
|
1
1
|
require 'sanitize'
|
2
2
|
require 'cgi'
|
3
|
+
require 'wikipedia'
|
4
|
+
require 'unicode'
|
3
5
|
|
4
6
|
module Scylla
|
5
7
|
class Generator
|
6
|
-
attr_accessor :dirtext, :dirlm, :minsize
|
8
|
+
attr_accessor :dirtext, :dirlm, :minsize, :delimiter
|
7
9
|
|
8
10
|
# dirtext: The location of the source training text files
|
9
11
|
# minsize: The minimum size of the ngrams that you would like to store
|
10
|
-
def initialize(dirtext = DEFAULT_SOURCE_DIR, dirlm = DEFAULT_TARGET_DIR, minsize = 0, silent = false)
|
12
|
+
def initialize(dirtext = DEFAULT_SOURCE_DIR, dirlm = DEFAULT_TARGET_DIR, minsize = 0, silent = false, delimiter = "[[classifier_delimiter]]")
|
11
13
|
@dirtext = dirtext
|
12
14
|
@dirlm = dirlm
|
13
15
|
@minsize = minsize
|
16
|
+
@delimiter = delimiter
|
14
17
|
end
|
15
18
|
|
16
19
|
# Loads all the .txt files in the specified source training text folder
|
@@ -18,21 +21,47 @@ module Scylla
|
|
18
21
|
# lib/scylla/lms as .lm files
|
19
22
|
def train
|
20
23
|
languages = Dir.glob(@dirlm + "/*.lm")
|
21
|
-
textpaths = Dir.glob(@dirtext + "/*.txt")
|
22
24
|
languages.each {|l| File.delete(l) }
|
23
|
-
|
24
|
-
|
25
|
+
locales = Scylla::Resources.locales
|
26
|
+
locales.each do |key, value|
|
27
|
+
path = File.join(@dirtext, "#{key}.txt")
|
28
|
+
text = ""
|
29
|
+
File.open(path).each { |line| text += " " + line }
|
30
|
+
write_lm(text, key)
|
25
31
|
end
|
26
32
|
end
|
33
|
+
|
34
|
+
def get_wikis
|
35
|
+
locales = Scylla::Resources.locales
|
36
|
+
locales.each do |key, value|
|
37
|
+
text = get_wiki(value[0],value[1])
|
38
|
+
textname = File.join(@dirtext, "#{key}.txt")
|
39
|
+
File.delete(textname) if File.exists?(textname)
|
40
|
+
File.open(textname, 'w') { |f| f.write(text) }
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
def get_wiki(locale,article)
|
45
|
+
Wikipedia.Configure {
|
46
|
+
domain "#{locale}.wikipedia.org"
|
47
|
+
path 'w/api.php'
|
48
|
+
}
|
49
|
+
p article
|
50
|
+
page = Wikipedia.find( article )
|
51
|
+
value = page.content.gsub(/\{\{(.*?)\}\}/,"")
|
52
|
+
value = value.gsub(/\[\[(.+?)\]\]/m,"")
|
53
|
+
value = value.gsub(/\{\{(.+?)\}\}/m,"")
|
54
|
+
value = value.gsub(/\{(.+?)\}/m,"")
|
55
|
+
value = value.gsub(/\[(.+?)\]/m,"")
|
56
|
+
Sanitize.clean(value)
|
57
|
+
end
|
27
58
|
|
28
59
|
# Reads a single text file specified by a path and writes a .lm file in
|
29
60
|
# lib/scylla/lms
|
30
|
-
def write_lm(
|
31
|
-
|
32
|
-
File.open(path).each { |line| text += " " + line }
|
33
|
-
p "Creating language map for " + path
|
61
|
+
def write_lm(text, language)
|
62
|
+
p "Creating language map for #{language}"
|
34
63
|
lm = create_lm(text, true)
|
35
|
-
lmname = File.join(@dirlm,
|
64
|
+
lmname = File.join(@dirlm, "#{language}.lm")
|
36
65
|
File.delete(lmname) if File.exists?(lmname)
|
37
66
|
File.open(lmname, 'w') do |f|
|
38
67
|
i = 0
|
@@ -45,11 +74,13 @@ module Scylla
|
|
45
74
|
end
|
46
75
|
|
47
76
|
def clean(string)
|
77
|
+
delimit = string.index(@delimiter)
|
78
|
+
string = string[0, delimit] if delimit
|
48
79
|
string = Sanitize.clean(string)
|
49
80
|
string = CGI.unescapeHTML(string)
|
50
81
|
string.gsub!(/(?:http|https):\/\/[a-z0-9]+(?:[\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(?:(?::[0-9]{1,5})?\/[^\s]*)?/, "")
|
51
|
-
string.gsub!(/[\*\^><!\"#\$%&\'\(\)
|
52
|
-
string.strip.split(" ").join(" ")
|
82
|
+
string.gsub!(/[\*\^><!\"#\$%&\'\(\)\*\+:;,._\/=\?@\{\}\[\]|\-\n\r0-9]/," ")
|
83
|
+
Unicode::downcase(string.strip.split(" ").join(" "))
|
53
84
|
end
|
54
85
|
|
55
86
|
# Creates a language map for a given input string.
|
@@ -57,12 +88,14 @@ module Scylla
|
|
57
88
|
# return the freqencies of the ngrams, or simply an array in sorted order
|
58
89
|
def create_lm(input, frequencies = false)
|
59
90
|
input = clean(input)
|
91
|
+
debugger
|
60
92
|
ngram = Hash.new
|
61
93
|
input.split(/[\d\s\[\]]/).each do |word|
|
62
94
|
word = "_" + word + "_";
|
63
95
|
len = word.size
|
64
96
|
for i in 0..word.size
|
65
|
-
(1..
|
97
|
+
for j in (1..3)
|
98
|
+
next unless word[i,j]
|
66
99
|
ngram[word[i,j]] ||= 0
|
67
100
|
ngram[word[i,j]] += 1 if (len > (j - 1))
|
68
101
|
end
|