treat 1.2.0 → 2.0.0rc1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +2 -2
- data/README.md +12 -21
- data/lib/treat/autoload.rb +44 -0
- data/lib/treat/config/config.rb +38 -0
- data/lib/treat/config/configurable.rb +51 -0
- data/lib/treat/config/data/config.rb +50 -0
- data/lib/treat/config/data/core.rb +52 -0
- data/lib/treat/config/data/databases.rb +10 -0
- data/lib/treat/config/data/entities.rb +15 -0
- data/lib/treat/config/data/languages/agnostic.rb +31 -0
- data/lib/treat/config/{languages → data/languages}/arabic.rb +0 -0
- data/lib/treat/config/{languages → data/languages}/chinese.rb +0 -0
- data/lib/treat/config/{languages → data/languages}/dutch.rb +1 -1
- data/lib/treat/config/data/languages/english.rb +95 -0
- data/lib/treat/config/data/languages/french.rb +148 -0
- data/lib/treat/config/data/languages/german.rb +135 -0
- data/lib/treat/config/{languages → data/languages}/greek.rb +1 -1
- data/lib/treat/config/data/languages/italian.rb +162 -0
- data/lib/treat/config/data/languages/polish.rb +11 -0
- data/lib/treat/config/{languages → data/languages}/portuguese.rb +1 -1
- data/lib/treat/config/{languages → data/languages}/russian.rb +1 -1
- data/lib/treat/config/data/languages/spanish.rb +291 -0
- data/lib/treat/config/data/languages/swedish.rb +289 -0
- data/lib/treat/config/data/libraries.rb +12 -0
- data/lib/treat/config/data/linguistics.rb +44 -0
- data/lib/treat/config/data/tags.rb +328 -0
- data/lib/treat/config/{workers → data/workers}/extractors.rb +2 -10
- data/lib/treat/config/{workers → data/workers}/formatters.rb +0 -0
- data/lib/treat/config/{workers → data/workers}/inflectors.rb +0 -0
- data/lib/treat/config/{workers → data/workers}/learners.rb +0 -0
- data/lib/treat/config/{workers → data/workers}/lexicalizers.rb +4 -3
- data/lib/treat/config/{workers → data/workers}/processors.rb +3 -3
- data/lib/treat/config/{workers → data/workers}/retrievers.rb +0 -0
- data/lib/treat/config/importable.rb +31 -0
- data/lib/treat/config/paths.rb +23 -0
- data/lib/treat/config/tags.rb +37 -0
- data/lib/treat/core/dsl.rb +55 -0
- data/lib/treat/{installer.rb → core/installer.rb} +10 -12
- data/lib/treat/core/server.rb +40 -0
- data/lib/treat/entities/entities.rb +101 -0
- data/lib/treat/entities/{abilities/doable.rb → entity/applicable.rb} +5 -3
- data/lib/treat/entities/{abilities → entity}/buildable.rb +118 -63
- data/lib/treat/entities/{abilities → entity}/checkable.rb +2 -2
- data/lib/treat/entities/{abilities → entity}/comparable.rb +6 -6
- data/lib/treat/entities/{abilities → entity}/countable.rb +2 -1
- data/lib/treat/entities/entity/debuggable.rb +86 -0
- data/lib/treat/entities/{abilities → entity}/delegatable.rb +16 -26
- data/lib/treat/entities/{abilities → entity}/exportable.rb +2 -2
- data/lib/treat/entities/{abilities → entity}/iterable.rb +4 -16
- data/lib/treat/entities/{abilities → entity}/magical.rb +22 -17
- data/lib/treat/entities/entity/registrable.rb +36 -0
- data/lib/treat/entities/{abilities → entity}/stringable.rb +18 -15
- data/lib/treat/entities/entity.rb +86 -77
- data/lib/treat/exception.rb +3 -0
- data/lib/treat/helpers/hash.rb +29 -0
- data/lib/treat/helpers/help.rb +35 -0
- data/lib/treat/helpers/object.rb +55 -0
- data/lib/treat/helpers/string.rb +124 -0
- data/lib/treat/{core → learning}/data_set.rb +11 -11
- data/lib/treat/{core → learning}/export.rb +3 -3
- data/lib/treat/{core → learning}/problem.rb +26 -16
- data/lib/treat/{core → learning}/question.rb +5 -9
- data/lib/treat/loaders/linguistics.rb +8 -9
- data/lib/treat/loaders/stanford.rb +5 -11
- data/lib/treat/modules.rb +33 -0
- data/lib/treat/proxies/array.rb +27 -0
- data/lib/treat/proxies/language.rb +47 -0
- data/lib/treat/proxies/number.rb +18 -0
- data/lib/treat/proxies/proxy.rb +25 -0
- data/lib/treat/proxies/string.rb +18 -0
- data/lib/treat/version.rb +10 -1
- data/lib/treat/{workers.rb → workers/categorizable.rb} +18 -19
- data/lib/treat/workers/extractors/keywords/tf_idf.rb +11 -11
- data/lib/treat/workers/extractors/language/what_language.rb +8 -6
- data/lib/treat/workers/extractors/name_tag/stanford.rb +10 -4
- data/lib/treat/workers/extractors/similarity/levenshtein.rb +36 -0
- data/lib/treat/workers/extractors/similarity/tf_idf.rb +27 -0
- data/lib/treat/workers/extractors/tf_idf/native.rb +4 -4
- data/lib/treat/workers/extractors/time/chronic.rb +2 -4
- data/lib/treat/workers/extractors/time/nickel.rb +19 -20
- data/lib/treat/workers/extractors/time/ruby.rb +2 -1
- data/lib/treat/workers/extractors/topic_words/lda.rb +12 -12
- data/lib/treat/workers/extractors/topics/reuters.rb +9 -13
- data/lib/treat/workers/formatters/readers/autoselect.rb +1 -1
- data/lib/treat/workers/formatters/readers/image.rb +19 -9
- data/lib/treat/workers/formatters/readers/odt.rb +2 -1
- data/lib/treat/workers/formatters/readers/pdf.rb +20 -3
- data/lib/treat/workers/formatters/readers/xml.rb +0 -1
- data/lib/treat/workers/formatters/serializers/mongo.rb +10 -20
- data/lib/treat/workers/formatters/serializers/xml.rb +17 -26
- data/lib/treat/workers/formatters/serializers/yaml.rb +5 -4
- data/lib/treat/workers/formatters/unserializers/mongo.rb +4 -4
- data/lib/treat/workers/formatters/unserializers/xml.rb +3 -4
- data/lib/treat/workers/formatters/unserializers/yaml.rb +3 -4
- data/lib/treat/workers/formatters/visualizers/dot.rb +1 -0
- data/lib/treat/workers/formatters/visualizers/standoff.rb +2 -3
- data/lib/treat/workers/formatters/visualizers/tree.rb +2 -3
- data/lib/treat/workers/{group.rb → groupable.rb} +9 -9
- data/lib/treat/workers/inflectors/cardinalizers/linguistics.rb +1 -3
- data/lib/treat/workers/inflectors/conjugators/linguistics.rb +5 -7
- data/lib/treat/workers/inflectors/declensors/english.rb +13 -20
- data/lib/treat/workers/inflectors/declensors/linguistics.rb +29 -28
- data/lib/treat/workers/inflectors/ordinalizers/linguistics.rb +0 -2
- data/lib/treat/workers/inflectors/stemmers/porter.rb +8 -10
- data/lib/treat/workers/inflectors/stemmers/porter_c.rb +7 -7
- data/lib/treat/workers/inflectors/stemmers/uea.rb +3 -8
- data/lib/treat/workers/learners/classifiers/id3.rb +17 -14
- data/lib/treat/workers/learners/classifiers/linear.rb +15 -27
- data/lib/treat/workers/learners/classifiers/mlp.rb +32 -19
- data/lib/treat/workers/learners/classifiers/svm.rb +28 -21
- data/lib/treat/workers/lexicalizers/categorizers/from_tag.rb +19 -3
- data/lib/treat/workers/lexicalizers/sensers/wordnet.rb +15 -7
- data/lib/treat/workers/lexicalizers/taggers/brill/patch.rb +4 -1
- data/lib/treat/workers/lexicalizers/taggers/brill.rb +8 -19
- data/lib/treat/workers/lexicalizers/taggers/lingua.rb +4 -15
- data/lib/treat/workers/lexicalizers/taggers/stanford.rb +22 -13
- data/lib/treat/workers/processors/chunkers/autoselect.rb +2 -3
- data/lib/treat/workers/processors/chunkers/html.rb +1 -6
- data/lib/treat/workers/processors/parsers/enju.rb +2 -4
- data/lib/treat/workers/processors/parsers/stanford.rb +13 -7
- data/lib/treat/workers/processors/segmenters/punkt.rb +25 -11
- data/lib/treat/workers/processors/segmenters/scalpel.rb +20 -0
- data/lib/treat/workers/processors/segmenters/srx.rb +42 -0
- data/lib/treat/workers/processors/segmenters/stanford.rb +5 -5
- data/lib/treat/workers/processors/segmenters/tactful.rb +21 -11
- data/lib/treat/workers/processors/tokenizers/ptb.rb +40 -30
- data/lib/treat/workers/processors/tokenizers/punkt.rb +14 -19
- data/lib/treat/workers/processors/tokenizers/stanford.rb +38 -22
- data/lib/treat/workers/retrievers/indexers/ferret.rb +6 -3
- data/lib/treat/workers/retrievers/searchers/ferret.rb +2 -2
- data/lib/treat/workers/workers.rb +6 -0
- data/lib/treat.rb +18 -32
- data/models/MANIFEST +1 -0
- data/spec/core/data_set.rb +174 -0
- data/spec/core/export.rb +52 -0
- data/spec/core/problem.rb +144 -0
- data/spec/core/question.rb +52 -0
- data/spec/{collection.rb → entities/collection.rb} +20 -35
- data/spec/{document.rb → entities/document.rb} +3 -54
- data/spec/{entity.rb → entities/entity.rb} +10 -9
- data/spec/entities/phrase.rb +33 -0
- data/spec/{token.rb → entities/token.rb} +0 -57
- data/spec/entities/word.rb +3 -0
- data/spec/{zone.rb → entities/zone.rb} +0 -26
- data/spec/helper.rb +116 -32
- data/spec/sandbox.rb +258 -25
- data/spec/treat.rb +26 -34
- data/spec/workers/agnostic.rb +137 -0
- data/spec/workers/english.rb +194 -0
- data/spec/workers/examples/english/economist/hungarys_troubles.txt +46 -0
- data/spec/workers/examples/english/economist/saving_the_euro.odt +0 -0
- data/spec/{samples → workers/examples/english}/mathematicians/archimedes.abw +0 -0
- data/spec/{samples → workers/examples/english}/mathematicians/euler.html +0 -0
- data/spec/{samples → workers/examples/english}/mathematicians/gauss.pdf +0 -0
- data/spec/{samples → workers/examples/english}/mathematicians/leibniz.txt +0 -0
- data/spec/{samples → workers/examples/english}/mathematicians/newton.doc +0 -0
- data/spec/workers/examples/english/phrase.xml +5 -0
- data/spec/workers/examples/english/test.txt +1 -0
- data/spec/workers/language.rb +280 -0
- data/spec/workers.rb +28 -0
- metadata +122 -105
- data/lib/treat/config/core/acronyms.rb +0 -5
- data/lib/treat/config/core/encodings.rb +0 -8
- data/lib/treat/config/core/entities.rb +0 -2
- data/lib/treat/config/core/language.rb +0 -3
- data/lib/treat/config/core/paths.rb +0 -8
- data/lib/treat/config/core/syntax.rb +0 -1
- data/lib/treat/config/core/verbosity.rb +0 -1
- data/lib/treat/config/databases/default.rb +0 -1
- data/lib/treat/config/databases/mongo.rb +0 -1
- data/lib/treat/config/languages/agnostic.rb +0 -34
- data/lib/treat/config/languages/english.rb +0 -60
- data/lib/treat/config/languages/french.rb +0 -18
- data/lib/treat/config/languages/german.rb +0 -18
- data/lib/treat/config/languages/italian.rb +0 -12
- data/lib/treat/config/languages/polish.rb +0 -12
- data/lib/treat/config/languages/spanish.rb +0 -12
- data/lib/treat/config/languages/swedish.rb +0 -12
- data/lib/treat/config/libraries/punkt.rb +0 -1
- data/lib/treat/config/libraries/reuters.rb +0 -1
- data/lib/treat/config/libraries/stanford.rb +0 -1
- data/lib/treat/config/linguistics/categories.rb +0 -4
- data/lib/treat/config/linguistics/punctuation.rb +0 -33
- data/lib/treat/config/tags/aligned.rb +0 -221
- data/lib/treat/config/tags/enju.rb +0 -71
- data/lib/treat/config/tags/paris7.rb +0 -17
- data/lib/treat/config/tags/ptb.rb +0 -15
- data/lib/treat/config/workers/list.rb +0 -1
- data/lib/treat/config.rb +0 -135
- data/lib/treat/core.rb +0 -5
- data/lib/treat/entities/abilities/copyable.rb +0 -47
- data/lib/treat/entities/abilities/debuggable.rb +0 -83
- data/lib/treat/entities/abilities/registrable.rb +0 -46
- data/lib/treat/entities/collection.rb +0 -40
- data/lib/treat/entities/document.rb +0 -10
- data/lib/treat/entities/group.rb +0 -18
- data/lib/treat/entities/section.rb +0 -13
- data/lib/treat/entities/token.rb +0 -47
- data/lib/treat/entities/zone.rb +0 -12
- data/lib/treat/entities.rb +0 -6
- data/lib/treat/helpers/didyoumean.rb +0 -57
- data/lib/treat/helpers/escaping.rb +0 -15
- data/lib/treat/helpers/formatting.rb +0 -41
- data/lib/treat/helpers/objtohash.rb +0 -8
- data/lib/treat/helpers/platform.rb +0 -15
- data/lib/treat/helpers/reflection.rb +0 -17
- data/lib/treat/helpers/temporary.rb +0 -27
- data/lib/treat/helpers/verbosity.rb +0 -19
- data/lib/treat/helpers.rb +0 -5
- data/lib/treat/loaders.rb +0 -10
- data/lib/treat/proxies.rb +0 -106
- data/lib/treat/workers/formatters/unserializers/autoselect.rb +0 -17
- data/lib/treat/workers/inflectors/declensors/active_support.rb +0 -31
- data/lib/treat/workers/processors/tokenizers/tactful.rb +0 -68
- data/spec/core.rb +0 -441
- data/spec/phrase.rb +0 -112
- data/spec/word.rb +0 -111
data/LICENSE
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
Treat - Text Retrieval, Extraction and Annotation Toolkit, v.
|
1
|
+
Treat - Text Retrieval, Extraction and Annotation Toolkit, v. 2.0.0rc1
|
2
2
|
|
3
3
|
This program is free software: you can redistribute it and/or modify
|
4
4
|
it under the terms of the GNU General Public License as published by
|
@@ -15,7 +15,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
15
15
|
|
16
16
|
Author: Louis-Antoine Mullie (louis.mullie@gmail.com). Copyright 2011-12.
|
17
17
|
|
18
|
-
|
18
|
+
A non-trivial amount of code has been incorporated and modified from other libraries:
|
19
19
|
|
20
20
|
- formatters/readers/odt.rb - Mark Watson (GPL license)
|
21
21
|
- processors/tokenizers/tactful.rb - Matthew Bunday (GPL license)
|
data/README.md
CHANGED
@@ -1,33 +1,24 @@
|
|
1
1
|
[![Build Status](https://secure.travis-ci.org/louismullie/treat.png)](http://travis-ci.org/#!/louismullie/treat)
|
2
|
-
[![
|
3
|
-
|
4
|
-
Treat is a framework for natural language processing and computational linguistics in Ruby. It provides a common API for a number of gems and external libraries for document retrieval, parsing, annotation, and information extraction.
|
2
|
+
[![Code Climate](https://codeclimate.com/badge.png)](https://codeclimate.com/github/louismullie/treat)
|
5
3
|
|
6
|
-
|
4
|
+
Treat is a toolkit for natural language processing and computational linguistics in Ruby. The Treat project aims to build a language- and algorithm-agnostic NLP framework for Ruby with support for tasks such as document retrieval, text chunking, segmentation and tokenization, natural language parsing, part-of-speech tagging, keyword extraction and named entity recognition. Learn more by taking a [quick tour](https://github.com/louismullie/treat/wiki/Quick-Tour) or by reading the [manual](https://github.com/louismullie/treat/wiki/Manual).
|
5
|
+
|
6
|
+
**Features**
|
7
7
|
|
8
8
|
* Text extractors for PDF, HTML, XML, Word, AbiWord, OpenOffice and image formats (Ocropus).
|
9
|
-
* Text
|
10
|
-
*
|
11
|
-
* Word inflectors, including stemmers, conjugators, declensors, and number inflection.
|
12
|
-
* Lexical resources (WordNet interface, several POS taggers for English, Stanford taggers for several languages).
|
9
|
+
* Text chunkers, sentence segmenters, tokenizers, and parsers (Stanford & Enju).
|
10
|
+
* Lexical resources (WordNet interface, several POS taggers for English).
|
13
11
|
* Language, date/time, topic words (LDA) and keyword (TF*IDF) extraction.
|
14
|
-
*
|
12
|
+
* Word inflectors, including stemmers, conjugators, declensors, and number inflection.
|
13
|
+
* Serialization of annotated entities to YAML, XML or to MongoDB.
|
15
14
|
* Visualization in ASCII tree, directed graph (DOT) and tag-bracketed (standoff) formats.
|
16
15
|
* Linguistic resources, including language detection and tag alignments for several treebanks.
|
17
|
-
*
|
18
|
-
|
19
|
-
<br>
|
16
|
+
* Machine learning (decision tree, multilayer perceptron, LIBLINEAR, LIBSVM).
|
17
|
+
* Text retrieval with indexation and full-text search (Ferret).
|
20
18
|
|
21
|
-
**
|
19
|
+
**Contributing**
|
22
20
|
|
23
|
-
|
24
|
-
* See how to [install Treat](https://github.com/louismullie/treat/wiki/Installation).
|
25
|
-
* Learn how to [use Treat](https://github.com/louismullie/treat/wiki/Manual).
|
26
|
-
* Help out by [contributing to the project](https://github.com/louismullie/treat/wiki/Contributing).
|
27
|
-
* View a list of [papers](https://github.com/louismullie/treat/wiki/Papers) about tools included in this toolkit.
|
28
|
-
* Open an [issue](https://github.com/louismullie/treat/issues).
|
29
|
-
|
30
|
-
<br>
|
21
|
+
I am actively seeking developers that can help maintain and expand this project. You can find a list of ideas for contributing to the project [here](https://github.com/louismullie/treat/wiki/Contributing).
|
31
22
|
|
32
23
|
**License**
|
33
24
|
|
@@ -0,0 +1,44 @@
|
|
1
|
+
# Basic mixin for all the main modules;
|
2
|
+
# takes care of requiring the right files
|
3
|
+
# in the right order for each one.
|
4
|
+
#
|
5
|
+
# If a module's folder (e.g. /entities)
|
6
|
+
# contains a file with a corresponding
|
7
|
+
# singular name (e.g. /entity), that
|
8
|
+
# base class is required first. Then,
|
9
|
+
# all the files that are found directly
|
10
|
+
# under that folder are required (but
|
11
|
+
# not those found in sub-folders).
|
12
|
+
module Treat::Autoload
|
13
|
+
|
14
|
+
# Loads all the files for the base
|
15
|
+
# module in the appropriate order.
|
16
|
+
def self.included(base)
|
17
|
+
m = self.get_module_name(base)
|
18
|
+
d = self.get_module_path(m)
|
19
|
+
n = self.singularize(m) + '.rb'
|
20
|
+
f, p = File.join(d, n), "#{d}/*.rb"
|
21
|
+
require f if File.readable?(f)
|
22
|
+
Dir.glob(p).each { |f| require f }
|
23
|
+
end
|
24
|
+
|
25
|
+
# Returns the path to a module's dir.
|
26
|
+
def self.get_module_path(name)
|
27
|
+
file = File.expand_path(__FILE__)
|
28
|
+
dirs = File.dirname(file).split('/')
|
29
|
+
File.join(*dirs[0..-1], name)
|
30
|
+
end
|
31
|
+
|
32
|
+
# Return the downcased form of the
|
33
|
+
# module's last name (e.g. "entities").
|
34
|
+
def self.get_module_name(mod)
|
35
|
+
mod.to_s.split('::')[-1].downcase
|
36
|
+
end
|
37
|
+
|
38
|
+
# Helper method to singularize words.
|
39
|
+
def self.singularize(w)
|
40
|
+
if w[-3..-1] == 'ies'; w[0..-4] + 'y'
|
41
|
+
else; (w[-1] == 's' ? w[0..-2] : w); end
|
42
|
+
end
|
43
|
+
|
44
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# This module uses structs to represent the
|
2
|
+
# configuration options that are stored in
|
3
|
+
# the /config folder.
|
4
|
+
module Treat::Config
|
5
|
+
|
6
|
+
# Require the importable mix-in.
|
7
|
+
require_relative 'importable'
|
8
|
+
|
9
|
+
# Make all configuration importable.
|
10
|
+
extend Treat::Config::Importable
|
11
|
+
|
12
|
+
# Core configuration options for entities.
|
13
|
+
class Treat::Config::Entities; end
|
14
|
+
|
15
|
+
# Configuration for paths to models, binaries,
|
16
|
+
# temporary storage and file downloads.
|
17
|
+
class Treat::Config::Paths; end
|
18
|
+
|
19
|
+
# Configuration for all Treat workers.
|
20
|
+
class Treat::Config::Workers; end
|
21
|
+
|
22
|
+
# Helpful linguistic options.
|
23
|
+
class Treat::Config::Linguistics; end
|
24
|
+
|
25
|
+
# Supported workers for each language.
|
26
|
+
class Treat::Config::Languages; end
|
27
|
+
|
28
|
+
# Configuration options for external libraries.
|
29
|
+
class Treat::Config::Libraries; end
|
30
|
+
|
31
|
+
# Configuration options for database
|
32
|
+
# connectivity (host, port, etc.)
|
33
|
+
class Treat::Config::Databases; end
|
34
|
+
|
35
|
+
# Configuration options for Treat core.
|
36
|
+
class Treat::Config::Core; end
|
37
|
+
|
38
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# Provide default functionality to load configuration
|
2
|
+
# options from flat files into their respective modules.
|
3
|
+
module Treat::Config::Configurable
|
4
|
+
|
5
|
+
# When extended, add the .config property to
|
6
|
+
# the class that is being operated on.
|
7
|
+
def self.extended(base)
|
8
|
+
class << base; attr_accessor :config; end
|
9
|
+
base.class_eval { self.config = {} }
|
10
|
+
end
|
11
|
+
|
12
|
+
# Provide base functionality to configure
|
13
|
+
# all modules. The behaviour is as follows:
|
14
|
+
#
|
15
|
+
# 1 - Check if a file named data/$CLASS$.rb
|
16
|
+
# exists; if so, load that file as the base
|
17
|
+
# configuration, i.e. "Treat.$CLASS$"; e.g.
|
18
|
+
# "Treat.core"
|
19
|
+
#
|
20
|
+
# 2 - Check if a folder named data/$CLASS$
|
21
|
+
# exists; if so, load each file in that folder
|
22
|
+
# as a suboption of the main configuration,
|
23
|
+
# i.e. "Treat.$CLASS$.$FILE$"; e.g. "Treat.workers"
|
24
|
+
#
|
25
|
+
# (where $CLASS$ is the lowercase name of
|
26
|
+
# the concrete class being extended by this.)
|
27
|
+
def configure!
|
28
|
+
path = File.dirname(File.expand_path( # FIXME
|
29
|
+
__FILE__)).split('/')[0..-4].join('/') + '/'
|
30
|
+
main_dir = path + 'lib/treat/config/data/'
|
31
|
+
mod_name = self.name.split('::')[-1].downcase
|
32
|
+
conf_dir = main_dir + mod_name
|
33
|
+
base_file = main_dir + mod_name + '.rb'
|
34
|
+
if File.readable?(base_file)
|
35
|
+
self.config = eval(File.read(base_file))
|
36
|
+
elsif FileTest.directory?(conf_dir)
|
37
|
+
self.config = self.from_dir(conf_dir)
|
38
|
+
else; raise Treat::Exception,
|
39
|
+
"No config file found for #{mod_name}."
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
# * Helper methods for configuration * #
|
44
|
+
def from_dir(conf_dir)
|
45
|
+
Hash[Dir[conf_dir + '/*'].map do |path|
|
46
|
+
name = File.basename(path, '.*').intern
|
47
|
+
[name, eval(File.read(path))]
|
48
|
+
end]
|
49
|
+
end
|
50
|
+
|
51
|
+
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
{acronyms:
|
2
|
+
['xml', 'html', 'txt', 'odt',
|
3
|
+
'abw', 'doc', 'yaml', 'uea',
|
4
|
+
'lda', 'pdf', 'ptb', 'dot',
|
5
|
+
'ai', 'id3', 'svo', 'mlp',
|
6
|
+
'svm', 'srx'],
|
7
|
+
|
8
|
+
encodings:
|
9
|
+
{language_to_code: {
|
10
|
+
arabic: 'UTF-8',
|
11
|
+
chinese: 'GB18030',
|
12
|
+
english: 'UTF-8',
|
13
|
+
french: 'UTF-8',
|
14
|
+
german: 'UTF-8',
|
15
|
+
hebrew: 'UTF-8'
|
16
|
+
}},
|
17
|
+
|
18
|
+
entities:
|
19
|
+
{list:
|
20
|
+
[:entity, :unknown, :email,
|
21
|
+
:url, :symbol, :sentence,
|
22
|
+
:punctuation, :number,
|
23
|
+
:enclitic, :word, :token,
|
24
|
+
:fragment, :phrase, :paragraph,
|
25
|
+
:title, :zone, :list, :block,
|
26
|
+
:page, :section, :collection,
|
27
|
+
:document],
|
28
|
+
order:
|
29
|
+
[:token, :fragment, :phrase,
|
30
|
+
:sentence, :zone, :section,
|
31
|
+
:document, :collection]},
|
32
|
+
language: {
|
33
|
+
default: :english,
|
34
|
+
detect: false,
|
35
|
+
detect_at: :document
|
36
|
+
},
|
37
|
+
paths: {
|
38
|
+
description: {
|
39
|
+
tmp: 'temporary files',
|
40
|
+
lib: 'class and module definitions',
|
41
|
+
bin: 'binary files',
|
42
|
+
files: 'user-saved files',
|
43
|
+
models: 'model files',
|
44
|
+
spec: 'spec test files'
|
45
|
+
}
|
46
|
+
},
|
47
|
+
|
48
|
+
syntax: { sweetened: false },
|
49
|
+
|
50
|
+
verbosity: { debug: false, silence: true}}
|
@@ -0,0 +1,52 @@
|
|
1
|
+
{
|
2
|
+
acronyms:
|
3
|
+
['xml', 'html', 'txt', 'odt',
|
4
|
+
'abw', 'doc', 'yaml', 'uea',
|
5
|
+
'lda', 'pdf', 'ptb', 'dot',
|
6
|
+
'ai', 'id3', 'svo', 'mlp',
|
7
|
+
'svm', 'srx'],
|
8
|
+
|
9
|
+
encodings:
|
10
|
+
{language_to_code: {
|
11
|
+
arabic: 'UTF-8',
|
12
|
+
chinese: 'GB18030',
|
13
|
+
english: 'UTF-8',
|
14
|
+
french: 'ISO_8859-1',
|
15
|
+
german: 'ISO_8859-1',
|
16
|
+
hebrew: 'UTF-8'
|
17
|
+
}},
|
18
|
+
|
19
|
+
entities:
|
20
|
+
{list:
|
21
|
+
[:entity, :unknown, :email,
|
22
|
+
:url, :symbol, :sentence,
|
23
|
+
:punctuation, :number,
|
24
|
+
:enclitic, :word, :token, :group,
|
25
|
+
:fragment, :phrase, :paragraph,
|
26
|
+
:title, :zone, :list, :block,
|
27
|
+
:page, :section, :collection,
|
28
|
+
:document],
|
29
|
+
order:
|
30
|
+
[:token, :fragment, :group,
|
31
|
+
:sentence, :zone, :section,
|
32
|
+
:document, :collection]},
|
33
|
+
language: {
|
34
|
+
default: :english,
|
35
|
+
detect: false,
|
36
|
+
detect_at: :document
|
37
|
+
},
|
38
|
+
paths: {
|
39
|
+
description: {
|
40
|
+
tmp: 'temporary files',
|
41
|
+
lib: 'class and module definitions',
|
42
|
+
bin: 'binary files',
|
43
|
+
files: 'user-saved files',
|
44
|
+
models: 'model files',
|
45
|
+
spec: 'spec test files'
|
46
|
+
}
|
47
|
+
},
|
48
|
+
|
49
|
+
syntax: { sweetened: false },
|
50
|
+
|
51
|
+
verbosity: { debug: false, silence: true}
|
52
|
+
}
|
@@ -0,0 +1,15 @@
|
|
1
|
+
{
|
2
|
+
list:
|
3
|
+
[:entity, :unknown, :email,
|
4
|
+
:url, :symbol, :sentence,
|
5
|
+
:punctuation, :number,
|
6
|
+
:enclitic, :word, :token,
|
7
|
+
:fragment, :phrase, :paragraph,
|
8
|
+
:title, :zone, :list, :block,
|
9
|
+
:page, :section, :collection,
|
10
|
+
:document],
|
11
|
+
order:
|
12
|
+
[:token, :fragment, :phrase,
|
13
|
+
:sentence, :zone, :section,
|
14
|
+
:document, :collection]
|
15
|
+
}
|
@@ -0,0 +1,31 @@
|
|
1
|
+
{
|
2
|
+
dependencies: [
|
3
|
+
'nokogiri', 'ferret',
|
4
|
+
'bson_ext', 'mongo', 'lda-ruby',
|
5
|
+
'stanford-core-nlp', 'linguistics',
|
6
|
+
'ruby-readability', 'whatlanguage',
|
7
|
+
'chronic', 'nickel', 'decisiontree',
|
8
|
+
'rb-libsvm', 'ruby-fann', 'zip',
|
9
|
+
'tf-idf-similarity', 'narray'
|
10
|
+
],
|
11
|
+
workers: {
|
12
|
+
learners: {
|
13
|
+
classifiers: [:id3, :linear, :mlp, :svm]
|
14
|
+
},
|
15
|
+
extractors: {
|
16
|
+
keywords: [:tf_idf],
|
17
|
+
language: [:what_language],
|
18
|
+
topic_words: [:lda],
|
19
|
+
tf_idf: [:native]
|
20
|
+
},
|
21
|
+
formatters: {
|
22
|
+
serializers: [:xml, :yaml, :mongo],
|
23
|
+
unserializers: [:xml, :yaml, :mongo],
|
24
|
+
visualizers: [:dot, :standoff, :tree]
|
25
|
+
},
|
26
|
+
retrievers: {
|
27
|
+
searchers: [:ferret],
|
28
|
+
indexers: [:ferret]
|
29
|
+
}
|
30
|
+
}
|
31
|
+
}
|
File without changes
|
File without changes
|
@@ -0,0 +1,95 @@
|
|
1
|
+
{
|
2
|
+
dependencies: [
|
3
|
+
'rbtagger',
|
4
|
+
'ruby-stemmer',
|
5
|
+
'punkt-segmenter',
|
6
|
+
'tactful_tokenizer',
|
7
|
+
'nickel',
|
8
|
+
'rwordnet',
|
9
|
+
'uea-stemmer',
|
10
|
+
'engtagger',
|
11
|
+
'activesupport',
|
12
|
+
'srx-english',
|
13
|
+
'scalpel'
|
14
|
+
],
|
15
|
+
workers: {
|
16
|
+
extractors: {
|
17
|
+
time: [:chronic, :ruby, :nickel],
|
18
|
+
topics: [:reuters],
|
19
|
+
name_tag: [:stanford]
|
20
|
+
},
|
21
|
+
inflectors: {
|
22
|
+
conjugators: [:linguistics],
|
23
|
+
declensors: [:english, :linguistics],
|
24
|
+
stemmers: [:porter, :porter_c, :uea],
|
25
|
+
ordinalizers: [:linguistics],
|
26
|
+
cardinalizers: [:linguistics]
|
27
|
+
},
|
28
|
+
lexicalizers: {
|
29
|
+
taggers: [:lingua, :brill, :stanford],
|
30
|
+
sensers: [:wordnet],
|
31
|
+
categorizers: [:from_tag]
|
32
|
+
},
|
33
|
+
processors: {
|
34
|
+
parsers: [:stanford],
|
35
|
+
segmenters: [:scalpel, :srx, :tactful, :punkt, :stanford],
|
36
|
+
tokenizers: [:ptb, :stanford, :punkt]
|
37
|
+
}
|
38
|
+
},
|
39
|
+
stop_words:
|
40
|
+
[
|
41
|
+
"about",
|
42
|
+
"also",
|
43
|
+
"are",
|
44
|
+
"away",
|
45
|
+
"because",
|
46
|
+
"been",
|
47
|
+
"beside",
|
48
|
+
"besides",
|
49
|
+
"between",
|
50
|
+
"but",
|
51
|
+
"cannot",
|
52
|
+
"could",
|
53
|
+
"did",
|
54
|
+
"etc",
|
55
|
+
"even",
|
56
|
+
"ever",
|
57
|
+
"every",
|
58
|
+
"for",
|
59
|
+
"had",
|
60
|
+
"have",
|
61
|
+
"how",
|
62
|
+
"into",
|
63
|
+
"isn",
|
64
|
+
"maybe",
|
65
|
+
"non",
|
66
|
+
"nor",
|
67
|
+
"now",
|
68
|
+
"should",
|
69
|
+
"such",
|
70
|
+
"than",
|
71
|
+
"that",
|
72
|
+
"then",
|
73
|
+
"these",
|
74
|
+
"this",
|
75
|
+
"those",
|
76
|
+
"though",
|
77
|
+
"too",
|
78
|
+
"was",
|
79
|
+
"wasn",
|
80
|
+
"were",
|
81
|
+
"what",
|
82
|
+
"when",
|
83
|
+
"where",
|
84
|
+
"which",
|
85
|
+
"while",
|
86
|
+
"who",
|
87
|
+
"whom",
|
88
|
+
"whose",
|
89
|
+
"will",
|
90
|
+
"with",
|
91
|
+
"would",
|
92
|
+
"wouldn",
|
93
|
+
"yes"
|
94
|
+
]
|
95
|
+
}
|
@@ -0,0 +1,148 @@
|
|
1
|
+
{
|
2
|
+
dependencies: [
|
3
|
+
'punkt-segmenter',
|
4
|
+
'tactful_tokenizer',
|
5
|
+
'stanford-core-nlp'
|
6
|
+
],
|
7
|
+
workers: {
|
8
|
+
processors: {
|
9
|
+
segmenters: [:scalpel],
|
10
|
+
tokenizers: [:ptb,:stanford],
|
11
|
+
parsers: [:stanford]
|
12
|
+
},
|
13
|
+
lexicalizers: {
|
14
|
+
taggers: [:stanford],
|
15
|
+
categorizers: [:from_tag]
|
16
|
+
}
|
17
|
+
},
|
18
|
+
stop_words:
|
19
|
+
[
|
20
|
+
"ailleurs",
|
21
|
+
"ainsi",
|
22
|
+
"alors",
|
23
|
+
"aucun",
|
24
|
+
"aucune",
|
25
|
+
"auquel",
|
26
|
+
"aurai",
|
27
|
+
"auras",
|
28
|
+
"aurez",
|
29
|
+
"aurons",
|
30
|
+
"auront",
|
31
|
+
"aussi",
|
32
|
+
"autre",
|
33
|
+
"autres",
|
34
|
+
"aux",
|
35
|
+
"auxquelles",
|
36
|
+
"auxquels",
|
37
|
+
"avaient",
|
38
|
+
"avais",
|
39
|
+
"avait",
|
40
|
+
"avec",
|
41
|
+
"avez",
|
42
|
+
"aviez",
|
43
|
+
"avoir",
|
44
|
+
"avons",
|
45
|
+
"celui",
|
46
|
+
"cependant",
|
47
|
+
"certaine",
|
48
|
+
"certaines",
|
49
|
+
"certains",
|
50
|
+
"ces",
|
51
|
+
"cet",
|
52
|
+
"cette",
|
53
|
+
"ceux",
|
54
|
+
"chacun",
|
55
|
+
"chacune",
|
56
|
+
"chaque",
|
57
|
+
"comme",
|
58
|
+
"constamment",
|
59
|
+
"davantage",
|
60
|
+
"depuis",
|
61
|
+
"des",
|
62
|
+
"desquelles",
|
63
|
+
"desquels",
|
64
|
+
"dessous",
|
65
|
+
"dessus",
|
66
|
+
"donc",
|
67
|
+
"dont",
|
68
|
+
"duquel",
|
69
|
+
"egalement",
|
70
|
+
"elles",
|
71
|
+
"encore",
|
72
|
+
"enfin",
|
73
|
+
"ensuite",
|
74
|
+
"etaient",
|
75
|
+
"etais",
|
76
|
+
"etait",
|
77
|
+
"etes",
|
78
|
+
"etiez",
|
79
|
+
"etions",
|
80
|
+
"etre",
|
81
|
+
"eux",
|
82
|
+
"guere",
|
83
|
+
"ici",
|
84
|
+
"ils",
|
85
|
+
"jamais",
|
86
|
+
"jusqu",
|
87
|
+
"laquelle",
|
88
|
+
"legerement",
|
89
|
+
"lequel",
|
90
|
+
"les",
|
91
|
+
"lesquelles",
|
92
|
+
"lesquels",
|
93
|
+
"leur",
|
94
|
+
"leurs",
|
95
|
+
"lors",
|
96
|
+
"lui",
|
97
|
+
"maintenant",
|
98
|
+
"mais",
|
99
|
+
"malgre",
|
100
|
+
"moi",
|
101
|
+
"moins",
|
102
|
+
"notamment",
|
103
|
+
"parce",
|
104
|
+
"plupart",
|
105
|
+
"pourtant",
|
106
|
+
"presentement",
|
107
|
+
"presque",
|
108
|
+
"puis",
|
109
|
+
"puisque",
|
110
|
+
"quand",
|
111
|
+
"quant",
|
112
|
+
"que",
|
113
|
+
"quel",
|
114
|
+
"quelqu",
|
115
|
+
"quelque",
|
116
|
+
"quelques",
|
117
|
+
"qui",
|
118
|
+
"quoi",
|
119
|
+
"quoique",
|
120
|
+
"rien",
|
121
|
+
"selon",
|
122
|
+
"serai",
|
123
|
+
"seras",
|
124
|
+
"serez",
|
125
|
+
"serons",
|
126
|
+
"seront",
|
127
|
+
"soient",
|
128
|
+
"soit",
|
129
|
+
"sommes",
|
130
|
+
"sont",
|
131
|
+
"sous",
|
132
|
+
"suis",
|
133
|
+
"telle",
|
134
|
+
"telles",
|
135
|
+
"tels",
|
136
|
+
"toi",
|
137
|
+
"toujours",
|
138
|
+
"tout",
|
139
|
+
"toutes",
|
140
|
+
"tres",
|
141
|
+
"trop",
|
142
|
+
"une",
|
143
|
+
"vos",
|
144
|
+
"votre",
|
145
|
+
"vous"
|
146
|
+
]
|
147
|
+
|
148
|
+
}
|