scylla 0.8.0 → 0.8.29

Sign up to get free protection for your applications and to get access to all the features.
Files changed (93) hide show
  1. data/Gemfile +4 -0
  2. data/Gemfile.lock +9 -1
  3. data/lib/scylla/generator.rb +46 -13
  4. data/lib/scylla/lms/afrikaans.lm +400 -400
  5. data/lib/scylla/lms/arabic.lm +400 -400
  6. data/lib/scylla/lms/bulgarian.lm +400 -400
  7. data/lib/scylla/lms/catalan.lm +399 -399
  8. data/lib/scylla/lms/chinese.lm +400 -400
  9. data/lib/scylla/lms/czech.lm +400 -0
  10. data/lib/scylla/lms/danish.lm +396 -396
  11. data/lib/scylla/lms/dutch.lm +400 -0
  12. data/lib/scylla/lms/english.lm +400 -400
  13. data/lib/scylla/lms/finnish.lm +400 -400
  14. data/lib/scylla/lms/french.lm +398 -398
  15. data/lib/scylla/lms/german.lm +400 -400
  16. data/lib/scylla/lms/greek.lm +400 -400
  17. data/lib/scylla/lms/hebrew.lm +399 -399
  18. data/lib/scylla/lms/hindi.lm +400 -400
  19. data/lib/scylla/lms/icelandic.lm +399 -399
  20. data/lib/scylla/lms/indonesian.lm +400 -400
  21. data/lib/scylla/lms/italian.lm +400 -400
  22. data/lib/scylla/lms/japanese.lm +399 -399
  23. data/lib/scylla/lms/kannada.lm +400 -0
  24. data/lib/scylla/lms/korean.lm +400 -400
  25. data/lib/scylla/lms/marathi.lm +400 -0
  26. data/lib/scylla/lms/norwegian.lm +400 -400
  27. data/lib/scylla/lms/persian.lm +400 -0
  28. data/lib/scylla/lms/polish.lm +400 -400
  29. data/lib/scylla/lms/portuguese.lm +400 -400
  30. data/lib/scylla/lms/romanian.lm +400 -400
  31. data/lib/scylla/lms/russian.lm +400 -400
  32. data/lib/scylla/lms/slovak.lm +400 -400
  33. data/lib/scylla/lms/slovenian.lm +387 -387
  34. data/lib/scylla/lms/spanish.lm +400 -400
  35. data/lib/scylla/lms/swedish.lm +399 -399
  36. data/lib/scylla/lms/tagalog.lm +400 -400
  37. data/lib/scylla/lms/thai.lm +400 -400
  38. data/lib/scylla/lms/turkish.lm +400 -400
  39. data/lib/scylla/lms/vietnamese.lm +400 -400
  40. data/lib/scylla/lms/welsh.lm +398 -398
  41. data/lib/scylla/resources.rb +43 -33
  42. data/lib/scylla/string.rb +2 -2
  43. data/lib/scylla.rb +0 -4
  44. data/pkg/scylla-0.5.0.gem +0 -0
  45. data/scylla.gemspec +1 -1
  46. data/source_texts/afrikaans.txt +330 -81
  47. data/source_texts/arabic.txt +590 -448
  48. data/source_texts/bulgarian.txt +588 -821
  49. data/source_texts/catalan.txt +435 -413
  50. data/source_texts/chinese.txt +526 -100
  51. data/source_texts/czech.txt +237 -0
  52. data/source_texts/danish.txt +233 -184
  53. data/source_texts/dutch.txt +503 -0
  54. data/source_texts/english.txt +673 -70
  55. data/source_texts/finnish.txt +939 -71
  56. data/source_texts/french.txt +879 -465
  57. data/source_texts/german.txt +1236 -137
  58. data/source_texts/greek.txt +488 -139
  59. data/source_texts/hebrew.txt +539 -100
  60. data/source_texts/hindi.txt +254 -100
  61. data/source_texts/icelandic.txt +301 -90
  62. data/source_texts/indonesian.txt +509 -93
  63. data/source_texts/italian.txt +1066 -120
  64. data/source_texts/japanese.txt +1217 -450
  65. data/source_texts/kannada.txt +340 -0
  66. data/source_texts/korean.txt +343 -219
  67. data/source_texts/marathi.txt +237 -0
  68. data/source_texts/norwegian.txt +555 -190
  69. data/source_texts/persian.txt +886 -0
  70. data/source_texts/polish.txt +1013 -90
  71. data/source_texts/portuguese.txt +690 -88
  72. data/source_texts/romanian.txt +436 -103
  73. data/source_texts/russian.txt +1029 -100
  74. data/source_texts/slovak.txt +575 -102
  75. data/source_texts/slovenian.txt +353 -99
  76. data/source_texts/spanish.txt +858 -675
  77. data/source_texts/swedish.txt +558 -488
  78. data/source_texts/tagalog.txt +391 -100
  79. data/source_texts/thai.txt +286 -60
  80. data/source_texts/turkish.txt +635 -87
  81. data/source_texts/vietnamese.txt +300 -92
  82. data/source_texts/welsh.txt +288 -104
  83. data/test/fixtures/lms/danish.lm +314 -314
  84. data/test/fixtures/lms/english.lm +301 -301
  85. data/test/fixtures/lms/french.lm +326 -326
  86. data/test/fixtures/lms/german.lm +331 -331
  87. data/test/fixtures/lms/hindi.lm +191 -191
  88. data/test/fixtures/lms/italian.lm +299 -299
  89. data/test/fixtures/lms/japanese.lm +103 -103
  90. data/test/fixtures/lms/norwegian.lm +309 -309
  91. data/test/fixtures/lms/spanish.lm +331 -331
  92. data/test/generator_test.rb +2 -2
  93. metadata +14 -3
@@ -10,7 +10,7 @@ class GeneratorTest < Test::Unit::TestCase
10
10
  @ngram_frequencies = [["_", 2], ["l", 2], ["lo_", 1], ["ello", 1], ["lo", 1], ["o", 1],
11
11
  ["llo", 1], ["hel", 1], ["o_", 1], ["ell", 1], ["e", 1], ["ello_", 1], ["_he", 1],
12
12
  ["el", 1], ["hello", 1], ["hell", 1], ["he", 1], ["_hel", 1], ["h", 1], ["_hell", 1],
13
- ["llo_", 1], ["_h", 1], ["ll", 1]]
13
+ ["llo_", 1], ["_h", 1], ["ll", 1]]
14
14
  end
15
15
 
16
16
  should "create an array of ngrams for a given text input" do
@@ -39,7 +39,7 @@ class GeneratorTest < Test::Unit::TestCase
39
39
  end
40
40
 
41
41
  should "Remove characters that throw off language detection" do
42
- assert_equal "Hello Go to to watch some shitty videos. Woooooo friend WIN TODAY", @sg.clean(@bad_text)
42
+ assert_equal "hello go to to watch some shitty videos woooooo friend win today", @sg.clean(@bad_text)
43
43
  end
44
44
  end
45
45
 
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scylla
3
3
  version: !ruby/object:Gem::Version
4
- hash: 63
4
+ hash: 5
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 8
9
- - 0
10
- version: 0.8.0
9
+ - 29
10
+ version: 0.8.29
11
11
  platform: ruby
12
12
  authors:
13
13
  - Ashwin Hegde
@@ -70,7 +70,9 @@ files:
70
70
  - lib/scylla/lms/bulgarian.lm
71
71
  - lib/scylla/lms/catalan.lm
72
72
  - lib/scylla/lms/chinese.lm
73
+ - lib/scylla/lms/czech.lm
73
74
  - lib/scylla/lms/danish.lm
75
+ - lib/scylla/lms/dutch.lm
74
76
  - lib/scylla/lms/english.lm
75
77
  - lib/scylla/lms/finnish.lm
76
78
  - lib/scylla/lms/french.lm
@@ -82,8 +84,11 @@ files:
82
84
  - lib/scylla/lms/indonesian.lm
83
85
  - lib/scylla/lms/italian.lm
84
86
  - lib/scylla/lms/japanese.lm
87
+ - lib/scylla/lms/kannada.lm
85
88
  - lib/scylla/lms/korean.lm
89
+ - lib/scylla/lms/marathi.lm
86
90
  - lib/scylla/lms/norwegian.lm
91
+ - lib/scylla/lms/persian.lm
87
92
  - lib/scylla/lms/polish.lm
88
93
  - lib/scylla/lms/portuguese.lm
89
94
  - lib/scylla/lms/romanian.lm
@@ -103,6 +108,7 @@ files:
103
108
  - lib/scylla/tasks.rb
104
109
  - lib/scylla.rb
105
110
  - LICENSE.txt
111
+ - pkg/scylla-0.5.0.gem
106
112
  - Rakefile
107
113
  - README.rdoc
108
114
  - scylla.gemspec
@@ -111,7 +117,9 @@ files:
111
117
  - source_texts/bulgarian.txt
112
118
  - source_texts/catalan.txt
113
119
  - source_texts/chinese.txt
120
+ - source_texts/czech.txt
114
121
  - source_texts/danish.txt
122
+ - source_texts/dutch.txt
115
123
  - source_texts/english.txt
116
124
  - source_texts/finnish.txt
117
125
  - source_texts/french.txt
@@ -123,8 +131,11 @@ files:
123
131
  - source_texts/indonesian.txt
124
132
  - source_texts/italian.txt
125
133
  - source_texts/japanese.txt
134
+ - source_texts/kannada.txt
126
135
  - source_texts/korean.txt
136
+ - source_texts/marathi.txt
127
137
  - source_texts/norwegian.txt
138
+ - source_texts/persian.txt
128
139
  - source_texts/polish.txt
129
140
  - source_texts/portuguese.txt
130
141
  - source_texts/romanian.txt