scylla 0.8.0 → 0.8.29
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +4 -0
- data/Gemfile.lock +9 -1
- data/lib/scylla/generator.rb +46 -13
- data/lib/scylla/lms/afrikaans.lm +400 -400
- data/lib/scylla/lms/arabic.lm +400 -400
- data/lib/scylla/lms/bulgarian.lm +400 -400
- data/lib/scylla/lms/catalan.lm +399 -399
- data/lib/scylla/lms/chinese.lm +400 -400
- data/lib/scylla/lms/czech.lm +400 -0
- data/lib/scylla/lms/danish.lm +396 -396
- data/lib/scylla/lms/dutch.lm +400 -0
- data/lib/scylla/lms/english.lm +400 -400
- data/lib/scylla/lms/finnish.lm +400 -400
- data/lib/scylla/lms/french.lm +398 -398
- data/lib/scylla/lms/german.lm +400 -400
- data/lib/scylla/lms/greek.lm +400 -400
- data/lib/scylla/lms/hebrew.lm +399 -399
- data/lib/scylla/lms/hindi.lm +400 -400
- data/lib/scylla/lms/icelandic.lm +399 -399
- data/lib/scylla/lms/indonesian.lm +400 -400
- data/lib/scylla/lms/italian.lm +400 -400
- data/lib/scylla/lms/japanese.lm +399 -399
- data/lib/scylla/lms/kannada.lm +400 -0
- data/lib/scylla/lms/korean.lm +400 -400
- data/lib/scylla/lms/marathi.lm +400 -0
- data/lib/scylla/lms/norwegian.lm +400 -400
- data/lib/scylla/lms/persian.lm +400 -0
- data/lib/scylla/lms/polish.lm +400 -400
- data/lib/scylla/lms/portuguese.lm +400 -400
- data/lib/scylla/lms/romanian.lm +400 -400
- data/lib/scylla/lms/russian.lm +400 -400
- data/lib/scylla/lms/slovak.lm +400 -400
- data/lib/scylla/lms/slovenian.lm +387 -387
- data/lib/scylla/lms/spanish.lm +400 -400
- data/lib/scylla/lms/swedish.lm +399 -399
- data/lib/scylla/lms/tagalog.lm +400 -400
- data/lib/scylla/lms/thai.lm +400 -400
- data/lib/scylla/lms/turkish.lm +400 -400
- data/lib/scylla/lms/vietnamese.lm +400 -400
- data/lib/scylla/lms/welsh.lm +398 -398
- data/lib/scylla/resources.rb +43 -33
- data/lib/scylla/string.rb +2 -2
- data/lib/scylla.rb +0 -4
- data/pkg/scylla-0.5.0.gem +0 -0
- data/scylla.gemspec +1 -1
- data/source_texts/afrikaans.txt +330 -81
- data/source_texts/arabic.txt +590 -448
- data/source_texts/bulgarian.txt +588 -821
- data/source_texts/catalan.txt +435 -413
- data/source_texts/chinese.txt +526 -100
- data/source_texts/czech.txt +237 -0
- data/source_texts/danish.txt +233 -184
- data/source_texts/dutch.txt +503 -0
- data/source_texts/english.txt +673 -70
- data/source_texts/finnish.txt +939 -71
- data/source_texts/french.txt +879 -465
- data/source_texts/german.txt +1236 -137
- data/source_texts/greek.txt +488 -139
- data/source_texts/hebrew.txt +539 -100
- data/source_texts/hindi.txt +254 -100
- data/source_texts/icelandic.txt +301 -90
- data/source_texts/indonesian.txt +509 -93
- data/source_texts/italian.txt +1066 -120
- data/source_texts/japanese.txt +1217 -450
- data/source_texts/kannada.txt +340 -0
- data/source_texts/korean.txt +343 -219
- data/source_texts/marathi.txt +237 -0
- data/source_texts/norwegian.txt +555 -190
- data/source_texts/persian.txt +886 -0
- data/source_texts/polish.txt +1013 -90
- data/source_texts/portuguese.txt +690 -88
- data/source_texts/romanian.txt +436 -103
- data/source_texts/russian.txt +1029 -100
- data/source_texts/slovak.txt +575 -102
- data/source_texts/slovenian.txt +353 -99
- data/source_texts/spanish.txt +858 -675
- data/source_texts/swedish.txt +558 -488
- data/source_texts/tagalog.txt +391 -100
- data/source_texts/thai.txt +286 -60
- data/source_texts/turkish.txt +635 -87
- data/source_texts/vietnamese.txt +300 -92
- data/source_texts/welsh.txt +288 -104
- data/test/fixtures/lms/danish.lm +314 -314
- data/test/fixtures/lms/english.lm +301 -301
- data/test/fixtures/lms/french.lm +326 -326
- data/test/fixtures/lms/german.lm +331 -331
- data/test/fixtures/lms/hindi.lm +191 -191
- data/test/fixtures/lms/italian.lm +299 -299
- data/test/fixtures/lms/japanese.lm +103 -103
- data/test/fixtures/lms/norwegian.lm +309 -309
- data/test/fixtures/lms/spanish.lm +331 -331
- data/test/generator_test.rb +2 -2
- metadata +14 -3
data/test/generator_test.rb
CHANGED
@@ -10,7 +10,7 @@ class GeneratorTest < Test::Unit::TestCase
|
|
10
10
|
@ngram_frequencies = [["_", 2], ["l", 2], ["lo_", 1], ["ello", 1], ["lo", 1], ["o", 1],
|
11
11
|
["llo", 1], ["hel", 1], ["o_", 1], ["ell", 1], ["e", 1], ["ello_", 1], ["_he", 1],
|
12
12
|
["el", 1], ["hello", 1], ["hell", 1], ["he", 1], ["_hel", 1], ["h", 1], ["_hell", 1],
|
13
|
-
["llo_", 1], ["_h", 1], ["ll", 1]]
|
13
|
+
["llo_", 1], ["_h", 1], ["ll", 1]]
|
14
14
|
end
|
15
15
|
|
16
16
|
should "create an array of ngrams for a given text input" do
|
@@ -39,7 +39,7 @@ class GeneratorTest < Test::Unit::TestCase
|
|
39
39
|
end
|
40
40
|
|
41
41
|
should "Remove characters that throw off language detection" do
|
42
|
-
assert_equal "
|
42
|
+
assert_equal "hello go to to watch some shitty videos woooooo friend win today", @sg.clean(@bad_text)
|
43
43
|
end
|
44
44
|
end
|
45
45
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scylla
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 5
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 8
|
9
|
-
- 0
|
10
|
-
version: 0.8.0
|
9
|
+
- 29
|
10
|
+
version: 0.8.29
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Ashwin Hegde
|
@@ -70,7 +70,9 @@ files:
|
|
70
70
|
- lib/scylla/lms/bulgarian.lm
|
71
71
|
- lib/scylla/lms/catalan.lm
|
72
72
|
- lib/scylla/lms/chinese.lm
|
73
|
+
- lib/scylla/lms/czech.lm
|
73
74
|
- lib/scylla/lms/danish.lm
|
75
|
+
- lib/scylla/lms/dutch.lm
|
74
76
|
- lib/scylla/lms/english.lm
|
75
77
|
- lib/scylla/lms/finnish.lm
|
76
78
|
- lib/scylla/lms/french.lm
|
@@ -82,8 +84,11 @@ files:
|
|
82
84
|
- lib/scylla/lms/indonesian.lm
|
83
85
|
- lib/scylla/lms/italian.lm
|
84
86
|
- lib/scylla/lms/japanese.lm
|
87
|
+
- lib/scylla/lms/kannada.lm
|
85
88
|
- lib/scylla/lms/korean.lm
|
89
|
+
- lib/scylla/lms/marathi.lm
|
86
90
|
- lib/scylla/lms/norwegian.lm
|
91
|
+
- lib/scylla/lms/persian.lm
|
87
92
|
- lib/scylla/lms/polish.lm
|
88
93
|
- lib/scylla/lms/portuguese.lm
|
89
94
|
- lib/scylla/lms/romanian.lm
|
@@ -103,6 +108,7 @@ files:
|
|
103
108
|
- lib/scylla/tasks.rb
|
104
109
|
- lib/scylla.rb
|
105
110
|
- LICENSE.txt
|
111
|
+
- pkg/scylla-0.5.0.gem
|
106
112
|
- Rakefile
|
107
113
|
- README.rdoc
|
108
114
|
- scylla.gemspec
|
@@ -111,7 +117,9 @@ files:
|
|
111
117
|
- source_texts/bulgarian.txt
|
112
118
|
- source_texts/catalan.txt
|
113
119
|
- source_texts/chinese.txt
|
120
|
+
- source_texts/czech.txt
|
114
121
|
- source_texts/danish.txt
|
122
|
+
- source_texts/dutch.txt
|
115
123
|
- source_texts/english.txt
|
116
124
|
- source_texts/finnish.txt
|
117
125
|
- source_texts/french.txt
|
@@ -123,8 +131,11 @@ files:
|
|
123
131
|
- source_texts/indonesian.txt
|
124
132
|
- source_texts/italian.txt
|
125
133
|
- source_texts/japanese.txt
|
134
|
+
- source_texts/kannada.txt
|
126
135
|
- source_texts/korean.txt
|
136
|
+
- source_texts/marathi.txt
|
127
137
|
- source_texts/norwegian.txt
|
138
|
+
- source_texts/persian.txt
|
128
139
|
- source_texts/polish.txt
|
129
140
|
- source_texts/portuguese.txt
|
130
141
|
- source_texts/romanian.txt
|