scylla 0.8.0 → 0.8.29

Sign up to get free protection for your applications and to get access to all the features.
Files changed (93)
  1. data/Gemfile +4 -0
  2. data/Gemfile.lock +9 -1
  3. data/lib/scylla/generator.rb +46 -13
  4. data/lib/scylla/lms/afrikaans.lm +400 -400
  5. data/lib/scylla/lms/arabic.lm +400 -400
  6. data/lib/scylla/lms/bulgarian.lm +400 -400
  7. data/lib/scylla/lms/catalan.lm +399 -399
  8. data/lib/scylla/lms/chinese.lm +400 -400
  9. data/lib/scylla/lms/czech.lm +400 -0
  10. data/lib/scylla/lms/danish.lm +396 -396
  11. data/lib/scylla/lms/dutch.lm +400 -0
  12. data/lib/scylla/lms/english.lm +400 -400
  13. data/lib/scylla/lms/finnish.lm +400 -400
  14. data/lib/scylla/lms/french.lm +398 -398
  15. data/lib/scylla/lms/german.lm +400 -400
  16. data/lib/scylla/lms/greek.lm +400 -400
  17. data/lib/scylla/lms/hebrew.lm +399 -399
  18. data/lib/scylla/lms/hindi.lm +400 -400
  19. data/lib/scylla/lms/icelandic.lm +399 -399
  20. data/lib/scylla/lms/indonesian.lm +400 -400
  21. data/lib/scylla/lms/italian.lm +400 -400
  22. data/lib/scylla/lms/japanese.lm +399 -399
  23. data/lib/scylla/lms/kannada.lm +400 -0
  24. data/lib/scylla/lms/korean.lm +400 -400
  25. data/lib/scylla/lms/marathi.lm +400 -0
  26. data/lib/scylla/lms/norwegian.lm +400 -400
  27. data/lib/scylla/lms/persian.lm +400 -0
  28. data/lib/scylla/lms/polish.lm +400 -400
  29. data/lib/scylla/lms/portuguese.lm +400 -400
  30. data/lib/scylla/lms/romanian.lm +400 -400
  31. data/lib/scylla/lms/russian.lm +400 -400
  32. data/lib/scylla/lms/slovak.lm +400 -400
  33. data/lib/scylla/lms/slovenian.lm +387 -387
  34. data/lib/scylla/lms/spanish.lm +400 -400
  35. data/lib/scylla/lms/swedish.lm +399 -399
  36. data/lib/scylla/lms/tagalog.lm +400 -400
  37. data/lib/scylla/lms/thai.lm +400 -400
  38. data/lib/scylla/lms/turkish.lm +400 -400
  39. data/lib/scylla/lms/vietnamese.lm +400 -400
  40. data/lib/scylla/lms/welsh.lm +398 -398
  41. data/lib/scylla/resources.rb +43 -33
  42. data/lib/scylla/string.rb +2 -2
  43. data/lib/scylla.rb +0 -4
  44. data/pkg/scylla-0.5.0.gem +0 -0
  45. data/scylla.gemspec +1 -1
  46. data/source_texts/afrikaans.txt +330 -81
  47. data/source_texts/arabic.txt +590 -448
  48. data/source_texts/bulgarian.txt +588 -821
  49. data/source_texts/catalan.txt +435 -413
  50. data/source_texts/chinese.txt +526 -100
  51. data/source_texts/czech.txt +237 -0
  52. data/source_texts/danish.txt +233 -184
  53. data/source_texts/dutch.txt +503 -0
  54. data/source_texts/english.txt +673 -70
  55. data/source_texts/finnish.txt +939 -71
  56. data/source_texts/french.txt +879 -465
  57. data/source_texts/german.txt +1236 -137
  58. data/source_texts/greek.txt +488 -139
  59. data/source_texts/hebrew.txt +539 -100
  60. data/source_texts/hindi.txt +254 -100
  61. data/source_texts/icelandic.txt +301 -90
  62. data/source_texts/indonesian.txt +509 -93
  63. data/source_texts/italian.txt +1066 -120
  64. data/source_texts/japanese.txt +1217 -450
  65. data/source_texts/kannada.txt +340 -0
  66. data/source_texts/korean.txt +343 -219
  67. data/source_texts/marathi.txt +237 -0
  68. data/source_texts/norwegian.txt +555 -190
  69. data/source_texts/persian.txt +886 -0
  70. data/source_texts/polish.txt +1013 -90
  71. data/source_texts/portuguese.txt +690 -88
  72. data/source_texts/romanian.txt +436 -103
  73. data/source_texts/russian.txt +1029 -100
  74. data/source_texts/slovak.txt +575 -102
  75. data/source_texts/slovenian.txt +353 -99
  76. data/source_texts/spanish.txt +858 -675
  77. data/source_texts/swedish.txt +558 -488
  78. data/source_texts/tagalog.txt +391 -100
  79. data/source_texts/thai.txt +286 -60
  80. data/source_texts/turkish.txt +635 -87
  81. data/source_texts/vietnamese.txt +300 -92
  82. data/source_texts/welsh.txt +288 -104
  83. data/test/fixtures/lms/danish.lm +314 -314
  84. data/test/fixtures/lms/english.lm +301 -301
  85. data/test/fixtures/lms/french.lm +326 -326
  86. data/test/fixtures/lms/german.lm +331 -331
  87. data/test/fixtures/lms/hindi.lm +191 -191
  88. data/test/fixtures/lms/italian.lm +299 -299
  89. data/test/fixtures/lms/japanese.lm +103 -103
  90. data/test/fixtures/lms/norwegian.lm +309 -309
  91. data/test/fixtures/lms/spanish.lm +331 -331
  92. data/test/generator_test.rb +2 -2
  93. metadata +14 -3
data/Gemfile CHANGED
@@ -10,6 +10,10 @@ group :development do
10
10
  gem "jeweler", "~> 1.6.4"
11
11
  gem "mail"
12
12
  gem "sanitize"
13
+ gem "character-encodings"
14
+ gem "wikipedia-client"
15
+ gem "json"
16
+ gem "unicode"
13
17
  end
14
18
 
15
19
  group :test do
data/Gemfile.lock CHANGED
@@ -1,6 +1,7 @@
1
1
  GEM
2
2
  remote: http://rubygems.org/
3
3
  specs:
4
+ character-encodings (0.4.1)
4
5
  columnize (0.3.4)
5
6
  git (1.2.5)
6
7
  i18n (0.6.0)
@@ -8,6 +9,7 @@ GEM
8
9
  bundler (~> 1.0)
9
10
  git (>= 1.2.5)
10
11
  rake
12
+ json (1.6.3)
11
13
  linecache (0.46)
12
14
  rbx-require-relative (> 0.0.4)
13
15
  mail (2.3.0)
@@ -18,7 +20,7 @@ GEM
18
20
  mocha (0.9.12)
19
21
  nokogiri (1.4.7)
20
22
  polyglot (0.3.2)
21
- rake (0.9.2)
23
+ rake (0.9.2.2)
22
24
  rbx-require-relative (0.0.5)
23
25
  ruby-debug (0.10.4)
24
26
  columnize (>= 0.1)
@@ -31,15 +33,21 @@ GEM
31
33
  treetop (1.4.10)
32
34
  polyglot
33
35
  polyglot (>= 0.3.1)
36
+ unicode (0.4.0)
37
+ wikipedia-client (1.0.0)
34
38
 
35
39
  PLATFORMS
36
40
  ruby
37
41
 
38
42
  DEPENDENCIES
39
43
  bundler (~> 1.0.0)
44
+ character-encodings
40
45
  jeweler (~> 1.6.4)
46
+ json
41
47
  mail
42
48
  mocha
43
49
  ruby-debug (~> 0.10.4)
44
50
  sanitize
45
51
  shoulda
52
+ unicode
53
+ wikipedia-client
data/lib/scylla/generator.rb CHANGED
@@ -1,16 +1,19 @@
1
1
  require 'sanitize'
2
2
  require 'cgi'
3
+ require 'wikipedia'
4
+ require 'unicode'
3
5
 
4
6
  module Scylla
5
7
  class Generator
6
- attr_accessor :dirtext, :dirlm, :minsize
8
+ attr_accessor :dirtext, :dirlm, :minsize, :delimiter
7
9
 
8
10
  # dirtext: The location of the source training text files
9
11
  # minsize: The minimum size of the ngrams that you would like to store
10
- def initialize(dirtext = DEFAULT_SOURCE_DIR, dirlm = DEFAULT_TARGET_DIR, minsize = 0, silent = false)
12
+ def initialize(dirtext = DEFAULT_SOURCE_DIR, dirlm = DEFAULT_TARGET_DIR, minsize = 0, silent = false, delimiter = "[[classifier_delimiter]]")
11
13
  @dirtext = dirtext
12
14
  @dirlm = dirlm
13
15
  @minsize = minsize
16
+ @delimiter = delimiter
14
17
  end
15
18
 
16
19
  # Loads all the .txt files in the specified source training text folder
@@ -18,21 +21,47 @@ module Scylla
18
21
  # lib/scylla/lms as .lm files
19
22
  def train
20
23
  languages = Dir.glob(@dirlm + "/*.lm")
21
- textpaths = Dir.glob(@dirtext + "/*.txt")
22
24
  languages.each {|l| File.delete(l) }
23
- textpaths.each do |path|
24
- write_lm(path)
25
+ locales = Scylla::Resources.locales
26
+ locales.each do |key, value|
27
+ path = File.join(@dirtext, "#{key}.txt")
28
+ text = ""
29
+ File.open(path).each { |line| text += " " + line }
30
+ write_lm(text, key)
25
31
  end
26
32
  end
33
+
34
+ def get_wikis
35
+ locales = Scylla::Resources.locales
36
+ locales.each do |key, value|
37
+ text = get_wiki(value[0],value[1])
38
+ textname = File.join(@dirtext, "#{key}.txt")
39
+ File.delete(textname) if File.exists?(textname)
40
+ File.open(textname, 'w') { |f| f.write(text) }
41
+ end
42
+ end
43
+
44
+ def get_wiki(locale,article)
45
+ Wikipedia.Configure {
46
+ domain "#{locale}.wikipedia.org"
47
+ path 'w/api.php'
48
+ }
49
+ p article
50
+ page = Wikipedia.find( article )
51
+ value = page.content.gsub(/\{\{(.*?)\}\}/,"")
52
+ value = value.gsub(/\[\[(.+?)\]\]/m,"")
53
+ value = value.gsub(/\{\{(.+?)\}\}/m,"")
54
+ value = value.gsub(/\{(.+?)\}/m,"")
55
+ value = value.gsub(/\[(.+?)\]/m,"")
56
+ Sanitize.clean(value)
57
+ end
27
58
 
28
59
  # Reads a single text file specified by a path and writes a .lm file in
29
60
  # lib/scylla/lms
30
- def write_lm(path)
31
- text = ""
32
- File.open(path).each { |line| text += " " + line }
33
- p "Creating language map for " + path
61
+ def write_lm(text, language)
62
+ p "Creating language map for #{language}"
34
63
  lm = create_lm(text, true)
35
- lmname = File.join(@dirlm, File.basename(path, ".txt") + ".lm")
64
+ lmname = File.join(@dirlm, "#{language}.lm")
36
65
  File.delete(lmname) if File.exists?(lmname)
37
66
  File.open(lmname, 'w') do |f|
38
67
  i = 0
@@ -45,11 +74,13 @@ module Scylla
45
74
  end
46
75
 
47
76
  def clean(string)
77
+ delimit = string.index(@delimiter)
78
+ string = string[0, delimit] if delimit
48
79
  string = Sanitize.clean(string)
49
80
  string = CGI.unescapeHTML(string)
50
81
  string.gsub!(/(?:http|https):\/\/[a-z0-9]+(?:[\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(?:(?::[0-9]{1,5})?\/[^\s]*)?/, "")
51
- string.gsub!(/[\*\^><!\"#\$%&\'\(\)\*\+:;=\?@\{\}\[\]|\-\n\r0-9]/," ")
52
- string.strip.split(" ").join(" ")
82
+ string.gsub!(/[\*\^><!\"#\$%&\'\(\)\*\+:;,._\/=\?@\{\}\[\]|\-\n\r0-9]/," ")
83
+ Unicode::downcase(string.strip.split(" ").join(" "))
53
84
  end
54
85
 
55
86
  # Creates a language map for a given input string.
@@ -57,12 +88,14 @@ module Scylla
57
88
  # return the freqencies of the ngrams, or simply an array in sorted order
58
89
  def create_lm(input, frequencies = false)
59
90
  input = clean(input)
91
+ debugger
60
92
  ngram = Hash.new
61
93
  input.split(/[\d\s\[\]]/).each do |word|
62
94
  word = "_" + word + "_";
63
95
  len = word.size
64
96
  for i in 0..word.size
65
- (1..5).each do |j|
97
+ for j in (1..3)
98
+ next unless word[i,j]
66
99
  ngram[word[i,j]] ||= 0
67
100
  ngram[word[i,j]] += 1 if (len > (j - 1))
68
101
  end