scylla 0.8.0 → 0.8.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. data/Gemfile +4 -0
  2. data/Gemfile.lock +9 -1
  3. data/lib/scylla/generator.rb +46 -13
  4. data/lib/scylla/lms/afrikaans.lm +400 -400
  5. data/lib/scylla/lms/arabic.lm +400 -400
  6. data/lib/scylla/lms/bulgarian.lm +400 -400
  7. data/lib/scylla/lms/catalan.lm +399 -399
  8. data/lib/scylla/lms/chinese.lm +400 -400
  9. data/lib/scylla/lms/czech.lm +400 -0
  10. data/lib/scylla/lms/danish.lm +396 -396
  11. data/lib/scylla/lms/dutch.lm +400 -0
  12. data/lib/scylla/lms/english.lm +400 -400
  13. data/lib/scylla/lms/finnish.lm +400 -400
  14. data/lib/scylla/lms/french.lm +398 -398
  15. data/lib/scylla/lms/german.lm +400 -400
  16. data/lib/scylla/lms/greek.lm +400 -400
  17. data/lib/scylla/lms/hebrew.lm +399 -399
  18. data/lib/scylla/lms/hindi.lm +400 -400
  19. data/lib/scylla/lms/icelandic.lm +399 -399
  20. data/lib/scylla/lms/indonesian.lm +400 -400
  21. data/lib/scylla/lms/italian.lm +400 -400
  22. data/lib/scylla/lms/japanese.lm +399 -399
  23. data/lib/scylla/lms/kannada.lm +400 -0
  24. data/lib/scylla/lms/korean.lm +400 -400
  25. data/lib/scylla/lms/marathi.lm +400 -0
  26. data/lib/scylla/lms/norwegian.lm +400 -400
  27. data/lib/scylla/lms/persian.lm +400 -0
  28. data/lib/scylla/lms/polish.lm +400 -400
  29. data/lib/scylla/lms/portuguese.lm +400 -400
  30. data/lib/scylla/lms/romanian.lm +400 -400
  31. data/lib/scylla/lms/russian.lm +400 -400
  32. data/lib/scylla/lms/slovak.lm +400 -400
  33. data/lib/scylla/lms/slovenian.lm +387 -387
  34. data/lib/scylla/lms/spanish.lm +400 -400
  35. data/lib/scylla/lms/swedish.lm +399 -399
  36. data/lib/scylla/lms/tagalog.lm +400 -400
  37. data/lib/scylla/lms/thai.lm +400 -400
  38. data/lib/scylla/lms/turkish.lm +400 -400
  39. data/lib/scylla/lms/vietnamese.lm +400 -400
  40. data/lib/scylla/lms/welsh.lm +398 -398
  41. data/lib/scylla/resources.rb +43 -33
  42. data/lib/scylla/string.rb +2 -2
  43. data/lib/scylla.rb +0 -4
  44. data/pkg/scylla-0.5.0.gem +0 -0
  45. data/scylla.gemspec +1 -1
  46. data/source_texts/afrikaans.txt +330 -81
  47. data/source_texts/arabic.txt +590 -448
  48. data/source_texts/bulgarian.txt +588 -821
  49. data/source_texts/catalan.txt +435 -413
  50. data/source_texts/chinese.txt +526 -100
  51. data/source_texts/czech.txt +237 -0
  52. data/source_texts/danish.txt +233 -184
  53. data/source_texts/dutch.txt +503 -0
  54. data/source_texts/english.txt +673 -70
  55. data/source_texts/finnish.txt +939 -71
  56. data/source_texts/french.txt +879 -465
  57. data/source_texts/german.txt +1236 -137
  58. data/source_texts/greek.txt +488 -139
  59. data/source_texts/hebrew.txt +539 -100
  60. data/source_texts/hindi.txt +254 -100
  61. data/source_texts/icelandic.txt +301 -90
  62. data/source_texts/indonesian.txt +509 -93
  63. data/source_texts/italian.txt +1066 -120
  64. data/source_texts/japanese.txt +1217 -450
  65. data/source_texts/kannada.txt +340 -0
  66. data/source_texts/korean.txt +343 -219
  67. data/source_texts/marathi.txt +237 -0
  68. data/source_texts/norwegian.txt +555 -190
  69. data/source_texts/persian.txt +886 -0
  70. data/source_texts/polish.txt +1013 -90
  71. data/source_texts/portuguese.txt +690 -88
  72. data/source_texts/romanian.txt +436 -103
  73. data/source_texts/russian.txt +1029 -100
  74. data/source_texts/slovak.txt +575 -102
  75. data/source_texts/slovenian.txt +353 -99
  76. data/source_texts/spanish.txt +858 -675
  77. data/source_texts/swedish.txt +558 -488
  78. data/source_texts/tagalog.txt +391 -100
  79. data/source_texts/thai.txt +286 -60
  80. data/source_texts/turkish.txt +635 -87
  81. data/source_texts/vietnamese.txt +300 -92
  82. data/source_texts/welsh.txt +288 -104
  83. data/test/fixtures/lms/danish.lm +314 -314
  84. data/test/fixtures/lms/english.lm +301 -301
  85. data/test/fixtures/lms/french.lm +326 -326
  86. data/test/fixtures/lms/german.lm +331 -331
  87. data/test/fixtures/lms/hindi.lm +191 -191
  88. data/test/fixtures/lms/italian.lm +299 -299
  89. data/test/fixtures/lms/japanese.lm +103 -103
  90. data/test/fixtures/lms/norwegian.lm +309 -309
  91. data/test/fixtures/lms/spanish.lm +331 -331
  92. data/test/generator_test.rb +2 -2
  93. metadata +14 -3
data/Gemfile CHANGED
@@ -10,6 +10,10 @@ group :development do
10
10
  gem "jeweler", "~> 1.6.4"
11
11
  gem "mail"
12
12
  gem "sanitize"
13
+ gem "character-encodings"
14
+ gem "wikipedia-client"
15
+ gem "json"
16
+ gem "unicode"
13
17
  end
14
18
 
15
19
  group :test do
data/Gemfile.lock CHANGED
@@ -1,6 +1,7 @@
1
1
  GEM
2
2
  remote: http://rubygems.org/
3
3
  specs:
4
+ character-encodings (0.4.1)
4
5
  columnize (0.3.4)
5
6
  git (1.2.5)
6
7
  i18n (0.6.0)
@@ -8,6 +9,7 @@ GEM
8
9
  bundler (~> 1.0)
9
10
  git (>= 1.2.5)
10
11
  rake
12
+ json (1.6.3)
11
13
  linecache (0.46)
12
14
  rbx-require-relative (> 0.0.4)
13
15
  mail (2.3.0)
@@ -18,7 +20,7 @@ GEM
18
20
  mocha (0.9.12)
19
21
  nokogiri (1.4.7)
20
22
  polyglot (0.3.2)
21
- rake (0.9.2)
23
+ rake (0.9.2.2)
22
24
  rbx-require-relative (0.0.5)
23
25
  ruby-debug (0.10.4)
24
26
  columnize (>= 0.1)
@@ -31,15 +33,21 @@ GEM
31
33
  treetop (1.4.10)
32
34
  polyglot
33
35
  polyglot (>= 0.3.1)
36
+ unicode (0.4.0)
37
+ wikipedia-client (1.0.0)
34
38
 
35
39
  PLATFORMS
36
40
  ruby
37
41
 
38
42
  DEPENDENCIES
39
43
  bundler (~> 1.0.0)
44
+ character-encodings
40
45
  jeweler (~> 1.6.4)
46
+ json
41
47
  mail
42
48
  mocha
43
49
  ruby-debug (~> 0.10.4)
44
50
  sanitize
45
51
  shoulda
52
+ unicode
53
+ wikipedia-client
data/lib/scylla/generator.rb CHANGED
@@ -1,16 +1,19 @@
1
1
  require 'sanitize'
2
2
  require 'cgi'
3
+ require 'wikipedia'
4
+ require 'unicode'
3
5
 
4
6
  module Scylla
5
7
  class Generator
6
- attr_accessor :dirtext, :dirlm, :minsize
8
+ attr_accessor :dirtext, :dirlm, :minsize, :delimiter
7
9
 
8
10
  # dirtext: The location of the source training text files
9
11
  # minsize: The minimum size of the ngrams that you would like to store
10
- def initialize(dirtext = DEFAULT_SOURCE_DIR, dirlm = DEFAULT_TARGET_DIR, minsize = 0, silent = false)
12
+ def initialize(dirtext = DEFAULT_SOURCE_DIR, dirlm = DEFAULT_TARGET_DIR, minsize = 0, silent = false, delimiter = "[[classifier_delimiter]]")
11
13
  @dirtext = dirtext
12
14
  @dirlm = dirlm
13
15
  @minsize = minsize
16
+ @delimiter = delimiter
14
17
  end
15
18
 
16
19
  # Loads all the .txt files in the specified source training text folder
@@ -18,21 +21,47 @@ module Scylla
18
21
  # lib/scylla/lms as .lm files
19
22
  def train
20
23
  languages = Dir.glob(@dirlm + "/*.lm")
21
- textpaths = Dir.glob(@dirtext + "/*.txt")
22
24
  languages.each {|l| File.delete(l) }
23
- textpaths.each do |path|
24
- write_lm(path)
25
+ locales = Scylla::Resources.locales
26
+ locales.each do |key, value|
27
+ path = File.join(@dirtext, "#{key}.txt")
28
+ text = ""
29
+ File.open(path).each { |line| text += " " + line }
30
+ write_lm(text, key)
25
31
  end
26
32
  end
33
+
34
+ def get_wikis
35
+ locales = Scylla::Resources.locales
36
+ locales.each do |key, value|
37
+ text = get_wiki(value[0],value[1])
38
+ textname = File.join(@dirtext, "#{key}.txt")
39
+ File.delete(textname) if File.exists?(textname)
40
+ File.open(textname, 'w') { |f| f.write(text) }
41
+ end
42
+ end
43
+
44
+ def get_wiki(locale,article)
45
+ Wikipedia.Configure {
46
+ domain "#{locale}.wikipedia.org"
47
+ path 'w/api.php'
48
+ }
49
+ p article
50
+ page = Wikipedia.find( article )
51
+ value = page.content.gsub(/\{\{(.*?)\}\}/,"")
52
+ value = value.gsub(/\[\[(.+?)\]\]/m,"")
53
+ value = value.gsub(/\{\{(.+?)\}\}/m,"")
54
+ value = value.gsub(/\{(.+?)\}/m,"")
55
+ value = value.gsub(/\[(.+?)\]/m,"")
56
+ Sanitize.clean(value)
57
+ end
27
58
 
28
59
  # Reads a single text file specified by a path and writes a .lm file in
29
60
  # lib/scylla/lms
30
- def write_lm(path)
31
- text = ""
32
- File.open(path).each { |line| text += " " + line }
33
- p "Creating language map for " + path
61
+ def write_lm(text, language)
62
+ p "Creating language map for #{language}"
34
63
  lm = create_lm(text, true)
35
- lmname = File.join(@dirlm, File.basename(path, ".txt") + ".lm")
64
+ lmname = File.join(@dirlm, "#{language}.lm")
36
65
  File.delete(lmname) if File.exists?(lmname)
37
66
  File.open(lmname, 'w') do |f|
38
67
  i = 0
@@ -45,11 +74,13 @@ module Scylla
45
74
  end
46
75
 
47
76
  def clean(string)
77
+ delimit = string.index(@delimiter)
78
+ string = string[0, delimit] if delimit
48
79
  string = Sanitize.clean(string)
49
80
  string = CGI.unescapeHTML(string)
50
81
  string.gsub!(/(?:http|https):\/\/[a-z0-9]+(?:[\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(?:(?::[0-9]{1,5})?\/[^\s]*)?/, "")
51
- string.gsub!(/[\*\^><!\"#\$%&\'\(\)\*\+:;=\?@\{\}\[\]|\-\n\r0-9]/," ")
52
- string.strip.split(" ").join(" ")
82
+ string.gsub!(/[\*\^><!\"#\$%&\'\(\)\*\+:;,._\/=\?@\{\}\[\]|\-\n\r0-9]/," ")
83
+ Unicode::downcase(string.strip.split(" ").join(" "))
53
84
  end
54
85
 
55
86
  # Creates a language map for a given input string.
@@ -57,12 +88,14 @@ module Scylla
57
88
  # return the freqencies of the ngrams, or simply an array in sorted order
58
89
  def create_lm(input, frequencies = false)
59
90
  input = clean(input)
91
+ debugger
60
92
  ngram = Hash.new
61
93
  input.split(/[\d\s\[\]]/).each do |word|
62
94
  word = "_" + word + "_";
63
95
  len = word.size
64
96
  for i in 0..word.size
65
- (1..5).each do |j|
97
+ for j in (1..3)
98
+ next unless word[i,j]
66
99
  ngram[word[i,j]] ||= 0
67
100
  ngram[word[i,j]] += 1 if (len > (j - 1))
68
101
  end