treat 1.2.0 → 2.0.0rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (217) hide show
  1. data/LICENSE +2 -2
  2. data/README.md +12 -21
  3. data/lib/treat/autoload.rb +44 -0
  4. data/lib/treat/config/config.rb +38 -0
  5. data/lib/treat/config/configurable.rb +51 -0
  6. data/lib/treat/config/data/config.rb +50 -0
  7. data/lib/treat/config/data/core.rb +52 -0
  8. data/lib/treat/config/data/databases.rb +10 -0
  9. data/lib/treat/config/data/entities.rb +15 -0
  10. data/lib/treat/config/data/languages/agnostic.rb +31 -0
  11. data/lib/treat/config/{languages → data/languages}/arabic.rb +0 -0
  12. data/lib/treat/config/{languages → data/languages}/chinese.rb +0 -0
  13. data/lib/treat/config/{languages → data/languages}/dutch.rb +1 -1
  14. data/lib/treat/config/data/languages/english.rb +95 -0
  15. data/lib/treat/config/data/languages/french.rb +148 -0
  16. data/lib/treat/config/data/languages/german.rb +135 -0
  17. data/lib/treat/config/{languages → data/languages}/greek.rb +1 -1
  18. data/lib/treat/config/data/languages/italian.rb +162 -0
  19. data/lib/treat/config/data/languages/polish.rb +11 -0
  20. data/lib/treat/config/{languages → data/languages}/portuguese.rb +1 -1
  21. data/lib/treat/config/{languages → data/languages}/russian.rb +1 -1
  22. data/lib/treat/config/data/languages/spanish.rb +291 -0
  23. data/lib/treat/config/data/languages/swedish.rb +289 -0
  24. data/lib/treat/config/data/libraries.rb +12 -0
  25. data/lib/treat/config/data/linguistics.rb +44 -0
  26. data/lib/treat/config/data/tags.rb +328 -0
  27. data/lib/treat/config/{workers → data/workers}/extractors.rb +2 -10
  28. data/lib/treat/config/{workers → data/workers}/formatters.rb +0 -0
  29. data/lib/treat/config/{workers → data/workers}/inflectors.rb +0 -0
  30. data/lib/treat/config/{workers → data/workers}/learners.rb +0 -0
  31. data/lib/treat/config/{workers → data/workers}/lexicalizers.rb +4 -3
  32. data/lib/treat/config/{workers → data/workers}/processors.rb +3 -3
  33. data/lib/treat/config/{workers → data/workers}/retrievers.rb +0 -0
  34. data/lib/treat/config/importable.rb +31 -0
  35. data/lib/treat/config/paths.rb +23 -0
  36. data/lib/treat/config/tags.rb +37 -0
  37. data/lib/treat/core/dsl.rb +55 -0
  38. data/lib/treat/{installer.rb → core/installer.rb} +10 -12
  39. data/lib/treat/core/server.rb +40 -0
  40. data/lib/treat/entities/entities.rb +101 -0
  41. data/lib/treat/entities/{abilities/doable.rb → entity/applicable.rb} +5 -3
  42. data/lib/treat/entities/{abilities → entity}/buildable.rb +118 -63
  43. data/lib/treat/entities/{abilities → entity}/checkable.rb +2 -2
  44. data/lib/treat/entities/{abilities → entity}/comparable.rb +6 -6
  45. data/lib/treat/entities/{abilities → entity}/countable.rb +2 -1
  46. data/lib/treat/entities/entity/debuggable.rb +86 -0
  47. data/lib/treat/entities/{abilities → entity}/delegatable.rb +16 -26
  48. data/lib/treat/entities/{abilities → entity}/exportable.rb +2 -2
  49. data/lib/treat/entities/{abilities → entity}/iterable.rb +4 -16
  50. data/lib/treat/entities/{abilities → entity}/magical.rb +22 -17
  51. data/lib/treat/entities/entity/registrable.rb +36 -0
  52. data/lib/treat/entities/{abilities → entity}/stringable.rb +18 -15
  53. data/lib/treat/entities/entity.rb +86 -77
  54. data/lib/treat/exception.rb +3 -0
  55. data/lib/treat/helpers/hash.rb +29 -0
  56. data/lib/treat/helpers/help.rb +35 -0
  57. data/lib/treat/helpers/object.rb +55 -0
  58. data/lib/treat/helpers/string.rb +124 -0
  59. data/lib/treat/{core → learning}/data_set.rb +11 -11
  60. data/lib/treat/{core → learning}/export.rb +3 -3
  61. data/lib/treat/{core → learning}/problem.rb +26 -16
  62. data/lib/treat/{core → learning}/question.rb +5 -9
  63. data/lib/treat/loaders/linguistics.rb +8 -9
  64. data/lib/treat/loaders/stanford.rb +5 -11
  65. data/lib/treat/modules.rb +33 -0
  66. data/lib/treat/proxies/array.rb +27 -0
  67. data/lib/treat/proxies/language.rb +47 -0
  68. data/lib/treat/proxies/number.rb +18 -0
  69. data/lib/treat/proxies/proxy.rb +25 -0
  70. data/lib/treat/proxies/string.rb +18 -0
  71. data/lib/treat/version.rb +10 -1
  72. data/lib/treat/{workers.rb → workers/categorizable.rb} +18 -19
  73. data/lib/treat/workers/extractors/keywords/tf_idf.rb +11 -11
  74. data/lib/treat/workers/extractors/language/what_language.rb +8 -6
  75. data/lib/treat/workers/extractors/name_tag/stanford.rb +10 -4
  76. data/lib/treat/workers/extractors/similarity/levenshtein.rb +36 -0
  77. data/lib/treat/workers/extractors/similarity/tf_idf.rb +27 -0
  78. data/lib/treat/workers/extractors/tf_idf/native.rb +4 -4
  79. data/lib/treat/workers/extractors/time/chronic.rb +2 -4
  80. data/lib/treat/workers/extractors/time/nickel.rb +19 -20
  81. data/lib/treat/workers/extractors/time/ruby.rb +2 -1
  82. data/lib/treat/workers/extractors/topic_words/lda.rb +12 -12
  83. data/lib/treat/workers/extractors/topics/reuters.rb +9 -13
  84. data/lib/treat/workers/formatters/readers/autoselect.rb +1 -1
  85. data/lib/treat/workers/formatters/readers/image.rb +19 -9
  86. data/lib/treat/workers/formatters/readers/odt.rb +2 -1
  87. data/lib/treat/workers/formatters/readers/pdf.rb +20 -3
  88. data/lib/treat/workers/formatters/readers/xml.rb +0 -1
  89. data/lib/treat/workers/formatters/serializers/mongo.rb +10 -20
  90. data/lib/treat/workers/formatters/serializers/xml.rb +17 -26
  91. data/lib/treat/workers/formatters/serializers/yaml.rb +5 -4
  92. data/lib/treat/workers/formatters/unserializers/mongo.rb +4 -4
  93. data/lib/treat/workers/formatters/unserializers/xml.rb +3 -4
  94. data/lib/treat/workers/formatters/unserializers/yaml.rb +3 -4
  95. data/lib/treat/workers/formatters/visualizers/dot.rb +1 -0
  96. data/lib/treat/workers/formatters/visualizers/standoff.rb +2 -3
  97. data/lib/treat/workers/formatters/visualizers/tree.rb +2 -3
  98. data/lib/treat/workers/{group.rb → groupable.rb} +9 -9
  99. data/lib/treat/workers/inflectors/cardinalizers/linguistics.rb +1 -3
  100. data/lib/treat/workers/inflectors/conjugators/linguistics.rb +5 -7
  101. data/lib/treat/workers/inflectors/declensors/english.rb +13 -20
  102. data/lib/treat/workers/inflectors/declensors/linguistics.rb +29 -28
  103. data/lib/treat/workers/inflectors/ordinalizers/linguistics.rb +0 -2
  104. data/lib/treat/workers/inflectors/stemmers/porter.rb +8 -10
  105. data/lib/treat/workers/inflectors/stemmers/porter_c.rb +7 -7
  106. data/lib/treat/workers/inflectors/stemmers/uea.rb +3 -8
  107. data/lib/treat/workers/learners/classifiers/id3.rb +17 -14
  108. data/lib/treat/workers/learners/classifiers/linear.rb +15 -27
  109. data/lib/treat/workers/learners/classifiers/mlp.rb +32 -19
  110. data/lib/treat/workers/learners/classifiers/svm.rb +28 -21
  111. data/lib/treat/workers/lexicalizers/categorizers/from_tag.rb +19 -3
  112. data/lib/treat/workers/lexicalizers/sensers/wordnet.rb +15 -7
  113. data/lib/treat/workers/lexicalizers/taggers/brill/patch.rb +4 -1
  114. data/lib/treat/workers/lexicalizers/taggers/brill.rb +8 -19
  115. data/lib/treat/workers/lexicalizers/taggers/lingua.rb +4 -15
  116. data/lib/treat/workers/lexicalizers/taggers/stanford.rb +22 -13
  117. data/lib/treat/workers/processors/chunkers/autoselect.rb +2 -3
  118. data/lib/treat/workers/processors/chunkers/html.rb +1 -6
  119. data/lib/treat/workers/processors/parsers/enju.rb +2 -4
  120. data/lib/treat/workers/processors/parsers/stanford.rb +13 -7
  121. data/lib/treat/workers/processors/segmenters/punkt.rb +25 -11
  122. data/lib/treat/workers/processors/segmenters/scalpel.rb +20 -0
  123. data/lib/treat/workers/processors/segmenters/srx.rb +42 -0
  124. data/lib/treat/workers/processors/segmenters/stanford.rb +5 -5
  125. data/lib/treat/workers/processors/segmenters/tactful.rb +21 -11
  126. data/lib/treat/workers/processors/tokenizers/ptb.rb +40 -30
  127. data/lib/treat/workers/processors/tokenizers/punkt.rb +14 -19
  128. data/lib/treat/workers/processors/tokenizers/stanford.rb +38 -22
  129. data/lib/treat/workers/retrievers/indexers/ferret.rb +6 -3
  130. data/lib/treat/workers/retrievers/searchers/ferret.rb +2 -2
  131. data/lib/treat/workers/workers.rb +6 -0
  132. data/lib/treat.rb +18 -32
  133. data/models/MANIFEST +1 -0
  134. data/spec/core/data_set.rb +174 -0
  135. data/spec/core/export.rb +52 -0
  136. data/spec/core/problem.rb +144 -0
  137. data/spec/core/question.rb +52 -0
  138. data/spec/{collection.rb → entities/collection.rb} +20 -35
  139. data/spec/{document.rb → entities/document.rb} +3 -54
  140. data/spec/{entity.rb → entities/entity.rb} +10 -9
  141. data/spec/entities/phrase.rb +33 -0
  142. data/spec/{token.rb → entities/token.rb} +0 -57
  143. data/spec/entities/word.rb +3 -0
  144. data/spec/{zone.rb → entities/zone.rb} +0 -26
  145. data/spec/helper.rb +116 -32
  146. data/spec/sandbox.rb +258 -25
  147. data/spec/treat.rb +26 -34
  148. data/spec/workers/agnostic.rb +137 -0
  149. data/spec/workers/english.rb +194 -0
  150. data/spec/workers/examples/english/economist/hungarys_troubles.txt +46 -0
  151. data/spec/workers/examples/english/economist/saving_the_euro.odt +0 -0
  152. data/spec/{samples → workers/examples/english}/mathematicians/archimedes.abw +0 -0
  153. data/spec/{samples → workers/examples/english}/mathematicians/euler.html +0 -0
  154. data/spec/{samples → workers/examples/english}/mathematicians/gauss.pdf +0 -0
  155. data/spec/{samples → workers/examples/english}/mathematicians/leibniz.txt +0 -0
  156. data/spec/{samples → workers/examples/english}/mathematicians/newton.doc +0 -0
  157. data/spec/workers/examples/english/phrase.xml +5 -0
  158. data/spec/workers/examples/english/test.txt +1 -0
  159. data/spec/workers/language.rb +280 -0
  160. data/spec/workers.rb +28 -0
  161. metadata +122 -105
  162. data/lib/treat/config/core/acronyms.rb +0 -5
  163. data/lib/treat/config/core/encodings.rb +0 -8
  164. data/lib/treat/config/core/entities.rb +0 -2
  165. data/lib/treat/config/core/language.rb +0 -3
  166. data/lib/treat/config/core/paths.rb +0 -8
  167. data/lib/treat/config/core/syntax.rb +0 -1
  168. data/lib/treat/config/core/verbosity.rb +0 -1
  169. data/lib/treat/config/databases/default.rb +0 -1
  170. data/lib/treat/config/databases/mongo.rb +0 -1
  171. data/lib/treat/config/languages/agnostic.rb +0 -34
  172. data/lib/treat/config/languages/english.rb +0 -60
  173. data/lib/treat/config/languages/french.rb +0 -18
  174. data/lib/treat/config/languages/german.rb +0 -18
  175. data/lib/treat/config/languages/italian.rb +0 -12
  176. data/lib/treat/config/languages/polish.rb +0 -12
  177. data/lib/treat/config/languages/spanish.rb +0 -12
  178. data/lib/treat/config/languages/swedish.rb +0 -12
  179. data/lib/treat/config/libraries/punkt.rb +0 -1
  180. data/lib/treat/config/libraries/reuters.rb +0 -1
  181. data/lib/treat/config/libraries/stanford.rb +0 -1
  182. data/lib/treat/config/linguistics/categories.rb +0 -4
  183. data/lib/treat/config/linguistics/punctuation.rb +0 -33
  184. data/lib/treat/config/tags/aligned.rb +0 -221
  185. data/lib/treat/config/tags/enju.rb +0 -71
  186. data/lib/treat/config/tags/paris7.rb +0 -17
  187. data/lib/treat/config/tags/ptb.rb +0 -15
  188. data/lib/treat/config/workers/list.rb +0 -1
  189. data/lib/treat/config.rb +0 -135
  190. data/lib/treat/core.rb +0 -5
  191. data/lib/treat/entities/abilities/copyable.rb +0 -47
  192. data/lib/treat/entities/abilities/debuggable.rb +0 -83
  193. data/lib/treat/entities/abilities/registrable.rb +0 -46
  194. data/lib/treat/entities/collection.rb +0 -40
  195. data/lib/treat/entities/document.rb +0 -10
  196. data/lib/treat/entities/group.rb +0 -18
  197. data/lib/treat/entities/section.rb +0 -13
  198. data/lib/treat/entities/token.rb +0 -47
  199. data/lib/treat/entities/zone.rb +0 -12
  200. data/lib/treat/entities.rb +0 -6
  201. data/lib/treat/helpers/didyoumean.rb +0 -57
  202. data/lib/treat/helpers/escaping.rb +0 -15
  203. data/lib/treat/helpers/formatting.rb +0 -41
  204. data/lib/treat/helpers/objtohash.rb +0 -8
  205. data/lib/treat/helpers/platform.rb +0 -15
  206. data/lib/treat/helpers/reflection.rb +0 -17
  207. data/lib/treat/helpers/temporary.rb +0 -27
  208. data/lib/treat/helpers/verbosity.rb +0 -19
  209. data/lib/treat/helpers.rb +0 -5
  210. data/lib/treat/loaders.rb +0 -10
  211. data/lib/treat/proxies.rb +0 -106
  212. data/lib/treat/workers/formatters/unserializers/autoselect.rb +0 -17
  213. data/lib/treat/workers/inflectors/declensors/active_support.rb +0 -31
  214. data/lib/treat/workers/processors/tokenizers/tactful.rb +0 -68
  215. data/spec/core.rb +0 -441
  216. data/spec/phrase.rb +0 -112
  217. data/spec/word.rb +0 -111
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Treat - Text Retrieval, Extraction and Annotation Toolkit, v. 1.1.2
1
+ Treat - Text Retrieval, Extraction and Annotation Toolkit, v. 2.0.0rc1
2
2
 
3
3
  This program is free software: you can redistribute it and/or modify
4
4
  it under the terms of the GNU General Public License as published by
@@ -15,7 +15,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
15
15
 
16
16
  Author: Louis-Antoine Mullie (louis.mullie@gmail.com). Copyright 2011-12.
17
17
 
18
- Non-trivial amount of code has been incorporated and modified from other libraries:
18
+ A non-trivial amount of code has been incorporated and modified from other libraries:
19
19
 
20
20
  - formatters/readers/odt.rb - Mark Watson (GPL license)
21
21
  - processors/tokenizers/tactful.rb - Matthew Bunday (GPL license)
data/README.md CHANGED
@@ -1,33 +1,24 @@
1
1
  [![Build Status](https://secure.travis-ci.org/louismullie/treat.png)](http://travis-ci.org/#!/louismullie/treat)
2
- [![Dependency Status](https://gemnasium.com/louismullie/treat.png)](https://gemnasium.com/louismullie/treat)
3
-
4
- Treat is a framework for natural language processing and computational linguistics in Ruby. It provides a common API for a number of gems and external libraries for document retrieval, parsing, annotation, and information extraction.
2
+ [![Code Climate](https://codeclimate.com/badge.png)](https://codeclimate.com/github/louismullie/treat)
5
3
 
6
- **Current features**
4
+ Treat is a toolkit for natural language processing and computational linguistics in Ruby. The Treat project aims to build a language- and algorithm-agnostic NLP framework for Ruby with support for tasks such as document retrieval, text chunking, segmentation and tokenization, natural language parsing, part-of-speech tagging, keyword extraction and named entity recognition. Learn more by taking a [quick tour](https://github.com/louismullie/treat/wiki/Quick-Tour) or by reading the [manual](https://github.com/louismullie/treat/wiki/Manual).
5
+
6
+ **Features**
7
7
 
8
8
  * Text extractors for PDF, HTML, XML, Word, AbiWord, OpenOffice and image formats (Ocropus).
9
- * Text retrieval with indexation and full-text search (Ferret).
10
- * Text chunkers, sentence segmenters, tokenizers, and parsers for several languages (Stanford & Enju).
11
- * Word inflectors, including stemmers, conjugators, declensors, and number inflection.
12
- * Lexical resources (WordNet interface, several POS taggers for English, Stanford taggers for several languages).
9
+ * Text chunkers, sentence segmenters, tokenizers, and parsers (Stanford & Enju).
10
+ * Lexical resources (WordNet interface, several POS taggers for English).
13
11
  * Language, date/time, topic words (LDA) and keyword (TF*IDF) extraction.
14
- * Serialization of annotated entities to YAML, XML formats or to MongoDB.
12
+ * Word inflectors, including stemmers, conjugators, declensors, and number inflection.
13
+ * Serialization of annotated entities to YAML, XML or MongoDB.
15
14
  * Visualization in ASCII tree, directed graph (DOT) and tag-bracketed (standoff) formats.
16
15
  * Linguistic resources, including language detection and tag alignments for several treebanks.
17
- * Decision tree and multilayer perceptron classification (liblinear coming soon!)
18
-
19
- <br>
16
+ * Machine learning (decision tree, multilayer perceptron, LIBLINEAR, LIBSVM).
17
+ * Text retrieval with indexation and full-text search (Ferret).
20
18
 
21
- **Resources**
19
+ **Contributing**
22
20
 
23
- * Read the [latest documentation](http://rubydoc.info/github/louismullie/treat/frames).
24
- * See how to [install Treat](https://github.com/louismullie/treat/wiki/Installation).
25
- * Learn how to [use Treat](https://github.com/louismullie/treat/wiki/Manual).
26
- * Help out by [contributing to the project](https://github.com/louismullie/treat/wiki/Contributing).
27
- * View a list of [papers](https://github.com/louismullie/treat/wiki/Papers) about tools included in this toolkit.
28
- * Open an [issue](https://github.com/louismullie/treat/issues).
29
-
30
- <br>
21
+ I am actively seeking developers who can help maintain and expand this project. You can find a list of ideas for contributing to the project [here](https://github.com/louismullie/treat/wiki/Contributing).
31
22
 
32
23
  **License**
33
24
 
@@ -0,0 +1,44 @@
1
+ # Basic mixin for all the main modules;
2
+ # takes care of requiring the right files
3
+ # in the right order for each one.
4
+ #
5
+ # If a module's folder (e.g. /entities)
6
+ # contains a file with a corresponding
7
+ # singular name (e.g. /entity), that
8
+ # base class is required first. Then,
9
+ # all the files that are found directly
10
+ # under that folder are required (but
11
+ # not those found in sub-folders).
12
+ module Treat::Autoload
13
+
14
+ # Loads all the files for the base
15
+ # module in the appropriate order.
16
+ def self.included(base)
17
+ m = self.get_module_name(base)
18
+ d = self.get_module_path(m)
19
+ n = self.singularize(m) + '.rb'
20
+ f, p = File.join(d, n), "#{d}/*.rb"
21
+ require f if File.readable?(f)
22
+ Dir.glob(p).each { |f| require f }
23
+ end
24
+
25
+ # Returns the path to a module's dir.
26
+ def self.get_module_path(name)
27
+ file = File.expand_path(__FILE__)
28
+ dirs = File.dirname(file).split('/')
29
+ File.join(*dirs[0..-1], name)
30
+ end
31
+
32
+ # Return the downcased form of the
33
+ # module's last name (e.g. "entities").
34
+ def self.get_module_name(mod)
35
+ mod.to_s.split('::')[-1].downcase
36
+ end
37
+
38
+ # Helper method to singularize words.
39
+ def self.singularize(w)
40
+ if w[-3..-1] == 'ies'; w[0..-4] + 'y'
41
+ else; (w[-1] == 's' ? w[0..-2] : w); end
42
+ end
43
+
44
+ end
@@ -0,0 +1,38 @@
1
+ # This module uses structs to represent the
2
+ # configuration options that are stored in
3
+ # the /config folder.
4
+ module Treat::Config
5
+
6
+ # Require the importable mix-in.
7
+ require_relative 'importable'
8
+
9
+ # Make all configuration importable.
10
+ extend Treat::Config::Importable
11
+
12
+ # Core configuration options for entities.
13
+ class Treat::Config::Entities; end
14
+
15
+ # Configuration for paths to models, binaries,
16
+ # temporary storage and file downloads.
17
+ class Treat::Config::Paths; end
18
+
19
+ # Configuration for all Treat workers.
20
+ class Treat::Config::Workers; end
21
+
22
+ # Helpful linguistic options.
23
+ class Treat::Config::Linguistics; end
24
+
25
+ # Supported workers for each language.
26
+ class Treat::Config::Languages; end
27
+
28
+ # Configuration options for external libraries.
29
+ class Treat::Config::Libraries; end
30
+
31
+ # Configuration options for database
32
+ # connectivity (host, port, etc.)
33
+ class Treat::Config::Databases; end
34
+
35
+ # Configuration options for Treat core.
36
+ class Treat::Config::Core; end
37
+
38
+ end
@@ -0,0 +1,51 @@
1
+ # Provide default functionality to load configuration
2
+ # options from flat files into their respective modules.
3
+ module Treat::Config::Configurable
4
+
5
+ # When extended, add the .config property to
6
+ # the class that is being operated on.
7
+ def self.extended(base)
8
+ class << base; attr_accessor :config; end
9
+ base.class_eval { self.config = {} }
10
+ end
11
+
12
+ # Provide base functionality to configure
13
+ # all modules. The behaviour is as follows:
14
+ #
15
+ # 1 - Check if a file named data/$CLASS$.rb
16
+ # exists; if so, load that file as the base
17
+ # configuration, i.e. "Treat.$CLASS$"; e.g.
18
+ # "Treat.core"
19
+ #
20
+ # 2 - Otherwise, check if a folder named data/$CLASS$
21
+ # exists; if so, load each file in that folder
22
+ # as a suboption of the main configuration, i.e.
23
+ # "Treat.$CLASS$.$FILE$"; e.g. "Treat.workers.extractors"
24
+ #
25
+ # (where $CLASS$ is the lowercase name of
26
+ # the concrete class being extended by this.)
27
+ def configure!
28
+ path = File.dirname(File.expand_path( # FIXME
29
+ __FILE__)).split('/')[0..-4].join('/') + '/'
30
+ main_dir = path + 'lib/treat/config/data/'
31
+ mod_name = self.name.split('::')[-1].downcase
32
+ conf_dir = main_dir + mod_name
33
+ base_file = main_dir + mod_name + '.rb'
34
+ if File.readable?(base_file)
35
+ self.config = eval(File.read(base_file))
36
+ elsif FileTest.directory?(conf_dir)
37
+ self.config = self.from_dir(conf_dir)
38
+ else; raise Treat::Exception,
39
+ "No config file found for #{mod_name}."
40
+ end
41
+ end
42
+
43
+ # * Helper methods for configuration * #
44
+ def from_dir(conf_dir)
45
+ Hash[Dir[conf_dir + '/*'].map do |path|
46
+ name = File.basename(path, '.*').intern
47
+ [name, eval(File.read(path))]
48
+ end]
49
+ end
50
+
51
+ end
@@ -0,0 +1,50 @@
1
+ {acronyms:
2
+ ['xml', 'html', 'txt', 'odt',
3
+ 'abw', 'doc', 'yaml', 'uea',
4
+ 'lda', 'pdf', 'ptb', 'dot',
5
+ 'ai', 'id3', 'svo', 'mlp',
6
+ 'svm', 'srx'],
7
+
8
+ encodings:
9
+ {language_to_code: {
10
+ arabic: 'UTF-8',
11
+ chinese: 'GB18030',
12
+ english: 'UTF-8',
13
+ french: 'UTF-8',
14
+ german: 'UTF-8',
15
+ hebrew: 'UTF-8'
16
+ }},
17
+
18
+ entities:
19
+ {list:
20
+ [:entity, :unknown, :email,
21
+ :url, :symbol, :sentence,
22
+ :punctuation, :number,
23
+ :enclitic, :word, :token,
24
+ :fragment, :phrase, :paragraph,
25
+ :title, :zone, :list, :block,
26
+ :page, :section, :collection,
27
+ :document],
28
+ order:
29
+ [:token, :fragment, :phrase,
30
+ :sentence, :zone, :section,
31
+ :document, :collection]},
32
+ language: {
33
+ default: :english,
34
+ detect: false,
35
+ detect_at: :document
36
+ },
37
+ paths: {
38
+ description: {
39
+ tmp: 'temporary files',
40
+ lib: 'class and module definitions',
41
+ bin: 'binary files',
42
+ files: 'user-saved files',
43
+ models: 'model files',
44
+ spec: 'spec test files'
45
+ }
46
+ },
47
+
48
+ syntax: { sweetened: false },
49
+
50
+ verbosity: { debug: false, silence: true}}
@@ -0,0 +1,52 @@
1
+ {
2
+ acronyms:
3
+ ['xml', 'html', 'txt', 'odt',
4
+ 'abw', 'doc', 'yaml', 'uea',
5
+ 'lda', 'pdf', 'ptb', 'dot',
6
+ 'ai', 'id3', 'svo', 'mlp',
7
+ 'svm', 'srx'],
8
+
9
+ encodings:
10
+ {language_to_code: {
11
+ arabic: 'UTF-8',
12
+ chinese: 'GB18030',
13
+ english: 'UTF-8',
14
+ french: 'ISO_8859-1',
15
+ german: 'ISO_8859-1',
16
+ hebrew: 'UTF-8'
17
+ }},
18
+
19
+ entities:
20
+ {list:
21
+ [:entity, :unknown, :email,
22
+ :url, :symbol, :sentence,
23
+ :punctuation, :number,
24
+ :enclitic, :word, :token, :group,
25
+ :fragment, :phrase, :paragraph,
26
+ :title, :zone, :list, :block,
27
+ :page, :section, :collection,
28
+ :document],
29
+ order:
30
+ [:token, :fragment, :group,
31
+ :sentence, :zone, :section,
32
+ :document, :collection]},
33
+ language: {
34
+ default: :english,
35
+ detect: false,
36
+ detect_at: :document
37
+ },
38
+ paths: {
39
+ description: {
40
+ tmp: 'temporary files',
41
+ lib: 'class and module definitions',
42
+ bin: 'binary files',
43
+ files: 'user-saved files',
44
+ models: 'model files',
45
+ spec: 'spec test files'
46
+ }
47
+ },
48
+
49
+ syntax: { sweetened: false },
50
+
51
+ verbosity: { debug: false, silence: true}
52
+ }
@@ -0,0 +1,10 @@
1
+ {
2
+ default: {
3
+ adapter: :mongo
4
+ },
5
+ mongo: {
6
+ host: 'localhost',
7
+ port: '27017',
8
+ db: nil
9
+ }
10
+ }
@@ -0,0 +1,15 @@
1
+ {
2
+ list:
3
+ [:entity, :unknown, :email,
4
+ :url, :symbol, :sentence,
5
+ :punctuation, :number,
6
+ :enclitic, :word, :token,
7
+ :fragment, :phrase, :paragraph,
8
+ :title, :zone, :list, :block,
9
+ :page, :section, :collection,
10
+ :document],
11
+ order:
12
+ [:token, :fragment, :phrase,
13
+ :sentence, :zone, :section,
14
+ :document, :collection]
15
+ }
@@ -0,0 +1,31 @@
1
+ {
2
+ dependencies: [
3
+ 'nokogiri', 'ferret',
4
+ 'bson_ext', 'mongo', 'lda-ruby',
5
+ 'stanford-core-nlp', 'linguistics',
6
+ 'ruby-readability', 'whatlanguage',
7
+ 'chronic', 'nickel', 'decisiontree',
8
+ 'rb-libsvm', 'ruby-fann', 'zip',
9
+ 'tf-idf-similarity', 'narray'
10
+ ],
11
+ workers: {
12
+ learners: {
13
+ classifiers: [:id3, :linear, :mlp, :svm]
14
+ },
15
+ extractors: {
16
+ keywords: [:tf_idf],
17
+ language: [:what_language],
18
+ topic_words: [:lda],
19
+ tf_idf: [:native]
20
+ },
21
+ formatters: {
22
+ serializers: [:xml, :yaml, :mongo],
23
+ unserializers: [:xml, :yaml, :mongo],
24
+ visualizers: [:dot, :standoff, :tree]
25
+ },
26
+ retrievers: {
27
+ searchers: [:ferret],
28
+ indexers: [:ferret]
29
+ }
30
+ }
31
+ }
@@ -6,7 +6,7 @@
6
6
  workers: {
7
7
  processors: {
8
8
  segmenters: [:punkt],
9
- tokenizers: [:tactful]
9
+ tokenizers: []
10
10
  }
11
11
  }
12
12
  }
@@ -0,0 +1,95 @@
1
+ {
2
+ dependencies: [
3
+ 'rbtagger',
4
+ 'ruby-stemmer',
5
+ 'punkt-segmenter',
6
+ 'tactful_tokenizer',
7
+ 'nickel',
8
+ 'rwordnet',
9
+ 'uea-stemmer',
10
+ 'engtagger',
11
+ 'activesupport',
12
+ 'srx-english',
13
+ 'scalpel'
14
+ ],
15
+ workers: {
16
+ extractors: {
17
+ time: [:chronic, :ruby, :nickel],
18
+ topics: [:reuters],
19
+ name_tag: [:stanford]
20
+ },
21
+ inflectors: {
22
+ conjugators: [:linguistics],
23
+ declensors: [:english, :linguistics],
24
+ stemmers: [:porter, :porter_c, :uea],
25
+ ordinalizers: [:linguistics],
26
+ cardinalizers: [:linguistics]
27
+ },
28
+ lexicalizers: {
29
+ taggers: [:lingua, :brill, :stanford],
30
+ sensers: [:wordnet],
31
+ categorizers: [:from_tag]
32
+ },
33
+ processors: {
34
+ parsers: [:stanford],
35
+ segmenters: [:scalpel, :srx, :tactful, :punkt, :stanford],
36
+ tokenizers: [:ptb, :stanford, :punkt]
37
+ }
38
+ },
39
+ stop_words:
40
+ [
41
+ "about",
42
+ "also",
43
+ "are",
44
+ "away",
45
+ "because",
46
+ "been",
47
+ "beside",
48
+ "besides",
49
+ "between",
50
+ "but",
51
+ "cannot",
52
+ "could",
53
+ "did",
54
+ "etc",
55
+ "even",
56
+ "ever",
57
+ "every",
58
+ "for",
59
+ "had",
60
+ "have",
61
+ "how",
62
+ "into",
63
+ "isn",
64
+ "maybe",
65
+ "non",
66
+ "nor",
67
+ "now",
68
+ "should",
69
+ "such",
70
+ "than",
71
+ "that",
72
+ "then",
73
+ "these",
74
+ "this",
75
+ "those",
76
+ "though",
77
+ "too",
78
+ "was",
79
+ "wasn",
80
+ "were",
81
+ "what",
82
+ "when",
83
+ "where",
84
+ "which",
85
+ "while",
86
+ "who",
87
+ "whom",
88
+ "whose",
89
+ "will",
90
+ "with",
91
+ "would",
92
+ "wouldn",
93
+ "yes"
94
+ ]
95
+ }
@@ -0,0 +1,148 @@
1
+ {
2
+ dependencies: [
3
+ 'punkt-segmenter',
4
+ 'tactful_tokenizer',
5
+ 'stanford-core-nlp'
6
+ ],
7
+ workers: {
8
+ processors: {
9
+ segmenters: [:scalpel],
10
+ tokenizers: [:ptb,:stanford],
11
+ parsers: [:stanford]
12
+ },
13
+ lexicalizers: {
14
+ taggers: [:stanford],
15
+ categorizers: [:from_tag]
16
+ }
17
+ },
18
+ stop_words:
19
+ [
20
+ "ailleurs",
21
+ "ainsi",
22
+ "alors",
23
+ "aucun",
24
+ "aucune",
25
+ "auquel",
26
+ "aurai",
27
+ "auras",
28
+ "aurez",
29
+ "aurons",
30
+ "auront",
31
+ "aussi",
32
+ "autre",
33
+ "autres",
34
+ "aux",
35
+ "auxquelles",
36
+ "auxquels",
37
+ "avaient",
38
+ "avais",
39
+ "avait",
40
+ "avec",
41
+ "avez",
42
+ "aviez",
43
+ "avoir",
44
+ "avons",
45
+ "celui",
46
+ "cependant",
47
+ "certaine",
48
+ "certaines",
49
+ "certains",
50
+ "ces",
51
+ "cet",
52
+ "cette",
53
+ "ceux",
54
+ "chacun",
55
+ "chacune",
56
+ "chaque",
57
+ "comme",
58
+ "constamment",
59
+ "davantage",
60
+ "depuis",
61
+ "des",
62
+ "desquelles",
63
+ "desquels",
64
+ "dessous",
65
+ "dessus",
66
+ "donc",
67
+ "dont",
68
+ "duquel",
69
+ "egalement",
70
+ "elles",
71
+ "encore",
72
+ "enfin",
73
+ "ensuite",
74
+ "etaient",
75
+ "etais",
76
+ "etait",
77
+ "etes",
78
+ "etiez",
79
+ "etions",
80
+ "etre",
81
+ "eux",
82
+ "guere",
83
+ "ici",
84
+ "ils",
85
+ "jamais",
86
+ "jusqu",
87
+ "laquelle",
88
+ "legerement",
89
+ "lequel",
90
+ "les",
91
+ "lesquelles",
92
+ "lesquels",
93
+ "leur",
94
+ "leurs",
95
+ "lors",
96
+ "lui",
97
+ "maintenant",
98
+ "mais",
99
+ "malgre",
100
+ "moi",
101
+ "moins",
102
+ "notamment",
103
+ "parce",
104
+ "plupart",
105
+ "pourtant",
106
+ "presentement",
107
+ "presque",
108
+ "puis",
109
+ "puisque",
110
+ "quand",
111
+ "quant",
112
+ "que",
113
+ "quel",
114
+ "quelqu",
115
+ "quelque",
116
+ "quelques",
117
+ "qui",
118
+ "quoi",
119
+ "quoique",
120
+ "rien",
121
+ "selon",
122
+ "serai",
123
+ "seras",
124
+ "serez",
125
+ "serons",
126
+ "seront",
127
+ "soient",
128
+ "soit",
129
+ "sommes",
130
+ "sont",
131
+ "sous",
132
+ "suis",
133
+ "telle",
134
+ "telles",
135
+ "tels",
136
+ "toi",
137
+ "toujours",
138
+ "tout",
139
+ "toutes",
140
+ "tres",
141
+ "trop",
142
+ "une",
143
+ "vos",
144
+ "votre",
145
+ "vous"
146
+ ]
147
+
148
+ }