scylla 0.8.32 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. data/lib/scylla/generator.rb +6 -2
  2. data/lib/scylla/lms/arabic.lm +318 -318
  3. data/lib/scylla/lms/bulgarian.lm +326 -326
  4. data/lib/scylla/lms/chinese.lm +399 -399
  5. data/lib/scylla/lms/french.lm +302 -302
  6. data/lib/scylla/lms/greek.lm +119 -119
  7. data/lib/scylla/lms/hebrew.lm +168 -168
  8. data/lib/scylla/lms/hindi.lm +108 -108
  9. data/lib/scylla/lms/japanese.lm +65 -65
  10. data/lib/scylla/lms/kannada.lm +147 -147
  11. data/lib/scylla/lms/korean.lm +151 -151
  12. data/lib/scylla/lms/marathi.lm +133 -133
  13. data/lib/scylla/lms/persian.lm +107 -107
  14. data/lib/scylla/lms/polish.lm +108 -108
  15. data/lib/scylla/lms/portuguese.lm +221 -221
  16. data/lib/scylla/lms/romanian.lm +132 -132
  17. data/lib/scylla/lms/russian.lm +82 -82
  18. data/lib/scylla/lms/thai.lm +119 -119
  19. data/lib/scylla/resources.rb +0 -1
  20. data/test/helper.rb +0 -1
  21. metadata +40 -55
  22. data/Gemfile +0 -23
  23. data/Gemfile.lock +0 -53
  24. data/Rakefile +0 -52
  25. data/VERSION +0 -1
  26. data/lib/scylla/lms/afrikaans.lm +0 -400
  27. data/pkg/scylla-0.5.0.gem +0 -0
  28. data/scylla-0.8.29.gem +0 -0
  29. data/scylla-0.8.31.gem +0 -0
  30. data/scylla.gemspec +0 -24
  31. data/source_texts/afrikaans.txt +0 -363
  32. data/source_texts/arabic.txt +0 -718
  33. data/source_texts/bulgarian.txt +0 -601
  34. data/source_texts/catalan.txt +0 -435
  35. data/source_texts/chinese.txt +0 -625
  36. data/source_texts/czech.txt +0 -237
  37. data/source_texts/danish.txt +0 -268
  38. data/source_texts/dutch.txt +0 -503
  39. data/source_texts/english.txt +0 -673
  40. data/source_texts/finnish.txt +0 -939
  41. data/source_texts/french.txt +0 -896
  42. data/source_texts/german.txt +0 -1236
  43. data/source_texts/greek.txt +0 -488
  44. data/source_texts/hebrew.txt +0 -638
  45. data/source_texts/hindi.txt +0 -353
  46. data/source_texts/icelandic.txt +0 -342
  47. data/source_texts/indonesian.txt +0 -509
  48. data/source_texts/italian.txt +0 -1066
  49. data/source_texts/japanese.txt +0 -1220
  50. data/source_texts/kannada.txt +0 -340
  51. data/source_texts/korean.txt +0 -343
  52. data/source_texts/marathi.txt +0 -237
  53. data/source_texts/norwegian.txt +0 -555
  54. data/source_texts/persian.txt +0 -886
  55. data/source_texts/polish.txt +0 -1014
  56. data/source_texts/portuguese.txt +0 -690
  57. data/source_texts/romanian.txt +0 -436
  58. data/source_texts/russian.txt +0 -1128
  59. data/source_texts/slovak.txt +0 -575
  60. data/source_texts/slovenian.txt +0 -354
  61. data/source_texts/spanish.txt +0 -1017
  62. data/source_texts/swedish.txt +0 -558
  63. data/source_texts/tagalog.txt +0 -426
  64. data/source_texts/thai.txt +0 -312
  65. data/source_texts/turkish.txt +0 -665
  66. data/source_texts/vietnamese.txt +0 -300
  67. data/source_texts/welsh.txt +0 -332
@@ -5,7 +5,7 @@ require 'unicode'
5
5
  module Scylla
6
6
  class Generator
7
7
  attr_accessor :dirtext, :dirlm, :minsize, :delimiter
8
-
8
+ NONLATIN = ["bg","ar","ru","zh","ja","he","kn","ko","mr","hi","th","fa","el"]
9
9
  # dirtext: The location of the source training text files
10
10
  # minsize: The minimum size of the ngrams that you would like to store
11
11
  def initialize(dirtext = DEFAULT_SOURCE_DIR, dirlm = DEFAULT_TARGET_DIR, minsize = 0, silent = false, delimiter = "[[classifier_delimiter]]")
@@ -22,11 +22,13 @@ module Scylla
22
22
  languages = Dir.glob(@dirlm + "/*.lm")
23
23
  languages.each {|l| File.delete(l) }
24
24
  locales = Scylla::Resources.locales
25
+ get_wikis
25
26
  locales.each do |key, value|
26
27
  path = File.join(@dirtext, "#{key}.txt")
27
28
  text = ""
28
29
  File.open(path).each { |line| text += " " + line }
29
30
  write_lm(text, key)
31
+ File.delete(path)
30
32
  end
31
33
  end
32
34
 
@@ -53,7 +55,9 @@ module Scylla
53
55
  value = value.gsub(/\{\{(.+?)\}\}/m,"")
54
56
  value = value.gsub(/\{(.+?)\}/m,"")
55
57
  value = value.gsub(/\[(.+?)\]/m,"")
56
- Sanitize.clean(value)
58
+ value = Sanitize.clean(value)
59
+ value = value.gsub(/[a-zA-Z]/,"") if NONLATIN.include?(locale)
60
+ clean(value)
57
61
  end
58
62
 
59
63
  # Reads a single text file specified by a path and writes a .lm file in