treat 0.2.5 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +3 -3
- data/README.md +33 -0
- data/files/INFO +1 -0
- data/lib/treat.rb +40 -105
- data/lib/treat/ai.rb +12 -0
- data/lib/treat/ai/classifiers/id3.rb +27 -0
- data/lib/treat/categories.rb +82 -35
- data/lib/treat/categorizable.rb +44 -0
- data/lib/treat/classification.rb +61 -0
- data/lib/treat/configurable.rb +115 -0
- data/lib/treat/data_set.rb +42 -0
- data/lib/treat/dependencies.rb +24 -0
- data/lib/treat/downloader.rb +87 -0
- data/lib/treat/entities.rb +68 -66
- data/lib/treat/entities/abilities.rb +10 -0
- data/lib/treat/entities/abilities/buildable.rb +327 -0
- data/lib/treat/entities/abilities/checkable.rb +31 -0
- data/lib/treat/entities/abilities/copyable.rb +45 -0
- data/lib/treat/entities/abilities/countable.rb +51 -0
- data/lib/treat/entities/abilities/debuggable.rb +83 -0
- data/lib/treat/entities/abilities/delegatable.rb +123 -0
- data/lib/treat/entities/abilities/doable.rb +62 -0
- data/lib/treat/entities/abilities/exportable.rb +11 -0
- data/lib/treat/entities/abilities/iterable.rb +115 -0
- data/lib/treat/entities/abilities/magical.rb +83 -0
- data/lib/treat/entities/abilities/registrable.rb +74 -0
- data/lib/treat/entities/abilities/stringable.rb +91 -0
- data/lib/treat/entities/entities.rb +104 -0
- data/lib/treat/entities/entity.rb +122 -245
- data/lib/treat/exception.rb +4 -4
- data/lib/treat/extractors.rb +77 -80
- data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
- data/lib/treat/extractors/language/what_language.rb +50 -45
- data/lib/treat/extractors/name_tag/stanford.rb +55 -0
- data/lib/treat/extractors/tf_idf/native.rb +87 -0
- data/lib/treat/extractors/time/chronic.rb +55 -0
- data/lib/treat/extractors/time/nickel.rb +86 -62
- data/lib/treat/extractors/time/ruby.rb +53 -0
- data/lib/treat/extractors/topic_words/lda.rb +67 -58
- data/lib/treat/extractors/topics/reuters.rb +100 -87
- data/lib/treat/formatters.rb +39 -35
- data/lib/treat/formatters/readers/abw.rb +49 -29
- data/lib/treat/formatters/readers/autoselect.rb +37 -33
- data/lib/treat/formatters/readers/doc.rb +19 -13
- data/lib/treat/formatters/readers/html.rb +52 -30
- data/lib/treat/formatters/readers/image.rb +41 -40
- data/lib/treat/formatters/readers/odt.rb +59 -45
- data/lib/treat/formatters/readers/pdf.rb +28 -25
- data/lib/treat/formatters/readers/txt.rb +12 -15
- data/lib/treat/formatters/readers/xml.rb +73 -36
- data/lib/treat/formatters/serializers/xml.rb +80 -79
- data/lib/treat/formatters/serializers/yaml.rb +19 -18
- data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
- data/lib/treat/formatters/unserializers/xml.rb +94 -99
- data/lib/treat/formatters/unserializers/yaml.rb +20 -19
- data/lib/treat/formatters/visualizers/dot.rb +132 -132
- data/lib/treat/formatters/visualizers/standoff.rb +52 -44
- data/lib/treat/formatters/visualizers/tree.rb +26 -29
- data/lib/treat/groupable.rb +153 -0
- data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
- data/lib/treat/inflectors.rb +50 -45
- data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
- data/lib/treat/inflectors/declensors/active_support.rb +31 -0
- data/lib/treat/inflectors/declensors/english.rb +38 -0
- data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
- data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
- data/lib/treat/inflectors/stemmers/porter.rb +160 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
- data/lib/treat/inflectors/stemmers/uea.rb +28 -0
- data/lib/treat/installer.rb +308 -0
- data/lib/treat/kernel.rb +105 -27
- data/lib/treat/languages.rb +122 -88
- data/lib/treat/languages/arabic.rb +15 -15
- data/lib/treat/languages/chinese.rb +15 -15
- data/lib/treat/languages/dutch.rb +15 -15
- data/lib/treat/languages/english.rb +61 -62
- data/lib/treat/languages/french.rb +19 -19
- data/lib/treat/languages/german.rb +20 -20
- data/lib/treat/languages/greek.rb +15 -15
- data/lib/treat/languages/italian.rb +16 -16
- data/lib/treat/languages/polish.rb +15 -15
- data/lib/treat/languages/portuguese.rb +15 -15
- data/lib/treat/languages/russian.rb +15 -15
- data/lib/treat/languages/spanish.rb +16 -16
- data/lib/treat/languages/swedish.rb +16 -16
- data/lib/treat/lexicalizers.rb +34 -55
- data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
- data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
- data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
- data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
- data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
- data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
- data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
- data/lib/treat/linguistics.rb +9 -0
- data/lib/treat/linguistics/categories.rb +11 -0
- data/lib/treat/linguistics/tags.rb +422 -0
- data/lib/treat/loaders/linguistics.rb +30 -0
- data/lib/treat/loaders/stanford.rb +27 -0
- data/lib/treat/object.rb +1 -0
- data/lib/treat/processors.rb +37 -44
- data/lib/treat/processors/chunkers/autoselect.rb +16 -0
- data/lib/treat/processors/chunkers/html.rb +71 -0
- data/lib/treat/processors/chunkers/txt.rb +18 -24
- data/lib/treat/processors/parsers/enju.rb +253 -208
- data/lib/treat/processors/parsers/stanford.rb +130 -131
- data/lib/treat/processors/segmenters/punkt.rb +79 -45
- data/lib/treat/processors/segmenters/stanford.rb +46 -48
- data/lib/treat/processors/segmenters/tactful.rb +43 -36
- data/lib/treat/processors/tokenizers/perl.rb +124 -92
- data/lib/treat/processors/tokenizers/ptb.rb +81 -0
- data/lib/treat/processors/tokenizers/punkt.rb +48 -42
- data/lib/treat/processors/tokenizers/stanford.rb +39 -38
- data/lib/treat/processors/tokenizers/tactful.rb +64 -55
- data/lib/treat/proxies.rb +52 -35
- data/lib/treat/retrievers.rb +26 -16
- data/lib/treat/retrievers/indexers/ferret.rb +47 -26
- data/lib/treat/retrievers/searchers/ferret.rb +69 -50
- data/lib/treat/tree.rb +241 -183
- data/spec/collection.rb +123 -0
- data/spec/document.rb +93 -0
- data/spec/entity.rb +408 -0
- data/spec/languages.rb +25 -0
- data/spec/phrase.rb +146 -0
- data/spec/samples/mathematicians/archimedes.abw +34 -0
- data/spec/samples/mathematicians/euler.html +21 -0
- data/spec/samples/mathematicians/gauss.pdf +0 -0
- data/spec/samples/mathematicians/leibniz.txt +13 -0
- data/spec/samples/mathematicians/newton.doc +0 -0
- data/spec/sandbox.rb +5 -0
- data/spec/token.rb +109 -0
- data/spec/treat.rb +52 -0
- data/spec/tree.rb +117 -0
- data/spec/word.rb +110 -0
- data/spec/zone.rb +66 -0
- data/tmp/INFO +1 -1
- metadata +100 -201
- data/INSTALL +0 -1
- data/README +0 -3
- data/TODO +0 -28
- data/lib/economist/half_cocked_basel.txt +0 -16
- data/lib/economist/hungarys_troubles.txt +0 -46
- data/lib/economist/indias_slowdown.txt +0 -15
- data/lib/economist/merkozy_rides_again.txt +0 -24
- data/lib/economist/prada_is_not_walmart.txt +0 -9
- data/lib/economist/to_infinity_and_beyond.txt +0 -15
- data/lib/ferret/_11.cfs +0 -0
- data/lib/ferret/_14.cfs +0 -0
- data/lib/ferret/_p.cfs +0 -0
- data/lib/ferret/_s.cfs +0 -0
- data/lib/ferret/_v.cfs +0 -0
- data/lib/ferret/_y.cfs +0 -0
- data/lib/ferret/segments +0 -0
- data/lib/ferret/segments_15 +0 -0
- data/lib/treat/buildable.rb +0 -157
- data/lib/treat/category.rb +0 -33
- data/lib/treat/delegatable.rb +0 -116
- data/lib/treat/doable.rb +0 -45
- data/lib/treat/entities/collection.rb +0 -14
- data/lib/treat/entities/document.rb +0 -12
- data/lib/treat/entities/phrases.rb +0 -17
- data/lib/treat/entities/tokens.rb +0 -61
- data/lib/treat/entities/zones.rb +0 -41
- data/lib/treat/extractors/coreferences/stanford.rb +0 -69
- data/lib/treat/extractors/date/chronic.rb +0 -32
- data/lib/treat/extractors/date/ruby.rb +0 -25
- data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
- data/lib/treat/extractors/language/language_extractor.rb +0 -27
- data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
- data/lib/treat/extractors/roles/naive.rb +0 -73
- data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
- data/lib/treat/extractors/statistics/position_in.rb +0 -14
- data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
- data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
- data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
- data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
- data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
- data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
- data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
- data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
- data/lib/treat/feature.rb +0 -58
- data/lib/treat/features.rb +0 -7
- data/lib/treat/formatters/visualizers/short_value.rb +0 -29
- data/lib/treat/formatters/visualizers/txt.rb +0 -45
- data/lib/treat/group.rb +0 -106
- data/lib/treat/helpers/linguistics_loader.rb +0 -18
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
- data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
- data/lib/treat/inflectors/declensions/english.rb +0 -319
- data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
- data/lib/treat/inflectors/stem/porter.rb +0 -162
- data/lib/treat/inflectors/stem/porter_c.rb +0 -26
- data/lib/treat/inflectors/stem/uea.rb +0 -30
- data/lib/treat/install.rb +0 -59
- data/lib/treat/languages/tags.rb +0 -377
- data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
- data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
- data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
- data/lib/treat/lexicalizers/tag/brill.rb +0 -91
- data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
- data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
- data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
- data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
- data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
- data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
- data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
- data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
- data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
- data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
- data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
- data/lib/treat/registrable.rb +0 -28
- data/lib/treat/sugar.rb +0 -50
- data/lib/treat/viewable.rb +0 -29
- data/lib/treat/visitable.rb +0 -28
- data/test/profile.rb +0 -2
- data/test/tc_entity.rb +0 -117
- data/test/tc_extractors.rb +0 -73
- data/test/tc_formatters.rb +0 -41
- data/test/tc_inflectors.rb +0 -34
- data/test/tc_lexicalizers.rb +0 -32
- data/test/tc_processors.rb +0 -50
- data/test/tc_resources.rb +0 -22
- data/test/tc_treat.rb +0 -60
- data/test/tc_tree.rb +0 -60
- data/test/tests.rb +0 -20
- data/test/texts.rb +0 -19
- data/test/texts/english/half_cocked_basel.txt +0 -16
- data/test/texts/english/hose_and_dry.doc +0 -0
- data/test/texts/english/hungarys_troubles.abw +0 -70
- data/test/texts/english/long.html +0 -24
- data/test/texts/english/long.txt +0 -22
- data/test/texts/english/medium.txt +0 -5
- data/test/texts/english/republican_nomination.pdf +0 -0
- data/test/texts/english/saving_the_euro.odt +0 -0
- data/test/texts/english/short.txt +0 -3
- data/test/texts/english/zero_sum.html +0 -111
data/LICENSE
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
Treat - Text Retrieval and Annotation Toolkit
|
1
|
+
Treat - Text Retrieval, Extraction and Annotation Toolkit
|
2
2
|
|
3
3
|
This program is free software: you can redistribute it and/or modify
|
4
4
|
it under the terms of the GNU General Public License as published by
|
@@ -11,9 +11,9 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
11
11
|
GNU General Public License for more details.
|
12
12
|
|
13
13
|
You should have received a copy of the GNU General Public License
|
14
|
-
along with this program.
|
14
|
+
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
15
15
|
|
16
|
-
Author: Louis-Antoine Mullie (louis.mullie@gmail.com). Copyright
|
16
|
+
Author: Louis-Antoine Mullie (louis.mullie@gmail.com). Copyright 2011-12.
|
17
17
|
|
18
18
|
Non-trivial amount of code has been incorporated and modified from other libraries:
|
19
19
|
|
data/README.md
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
 
|
2
|
+
|
3
|
+
Treat is a toolkit for natural language processing and computational linguistics in Ruby. It provides a common API for a number of gems and external libraries for document retrieval, parsing, annotation, and information extraction.
|
4
|
+
|
5
|
+
**Current features**
|
6
|
+
|
7
|
+
* Text extractors for PDF, HTML, XML, Word, AbiWord, OpenOffice and image formats (Ocropus)
|
8
|
+
* Text retrieval with indexation and full-text search (Ferret)
|
9
|
+
* Text chunkers, sentence segmenters, tokenizers, and parsers for several languages (Stanford & Enju)
|
10
|
+
* Word inflectors, including stemmers, conjugators, declensors, and number inflection
|
11
|
+
* Lexical resources (WordNet interface, several POS taggers for English, Stanford taggers for several languages)
|
12
|
+
* Language, date/time, topic words (LDA) and keyword (TF*IDF) extraction.
|
13
|
+
* Simple text statistics (frequency, TF*IDF)
|
14
|
+
* Serialization of annotated entities to YAML or XML format
|
15
|
+
* Visualization in ASCII tree, directed graph (DOT) and tag-bracketed (standoff) formats
|
16
|
+
* Linguistic resources, including full ISO-639-1 and ISO-639-2 support, and tag alignments for several treebanks
|
17
|
+
|
18
|
+
<br>
|
19
|
+
|
20
|
+
**Resources**
|
21
|
+
|
22
|
+
* Read the [latest documentation](http://rubydoc.info/github/louismullie/treat/master/frames).
|
23
|
+
* See how to [install Treat](https://github.com/louismullie/treat/wiki/Installing-Treat).
|
24
|
+
* Learn how to [use Treat](https://github.com/louismullie/treat/wiki/Using-Treat).
|
25
|
+
* Help out by [contributing to the project](https://github.com/louismullie/treat/wiki/Contributing-to-Treat).
|
26
|
+
* View a list of [papers](https://github.com/louismullie/treat/wiki/Papers) about tools included in this toolkit.
|
27
|
+
* Open an [issue](https://github.com/louismullie/treat/issues).
|
28
|
+
|
29
|
+
<br>
|
30
|
+
|
31
|
+
**License**
|
32
|
+
|
33
|
+
This software is released under the [GPL License](https://github.com/louismullie/treat/wiki/License-Information) and includes software released under the GPL, Ruby, Apache 2.0 and MIT licenses.
|
data/files/INFO
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
This is a folder containing the files downloaded by Treat.
|
data/lib/treat.rb
CHANGED
@@ -1,127 +1,62 @@
|
|
1
|
-
# Main namespace for Treat modules.
|
2
|
-
#
|
3
|
-
# === Entities
|
4
|
-
#
|
5
|
-
# Entities are Tree structures that represent textual entities
|
6
|
-
# (from a collection of texts down to an individual word), with
|
7
|
-
# a value, features, children and dependencies linking it to other
|
8
|
-
# textual entities.
|
9
|
-
#
|
10
|
-
# Here are some example of how to create entities:
|
11
|
-
#
|
12
|
-
# Treat.sweeten!
|
13
|
-
#
|
14
|
-
# c = Collection 'folder_with_documents'
|
15
|
-
# d = Document 'filename.txt'
|
16
|
-
# p = Paragraph 'A short story. The end.'
|
17
|
-
# s = Sentence 'That is not a sentence.'
|
18
|
-
# w = Word 'fox'
|
19
|
-
#
|
20
|
-
# Here is a list of entities and their description:
|
21
|
-
#
|
22
|
-
# - A Collection represents a folder with different textual documents.
|
23
|
-
# - A Document represents a file with a textual content.
|
24
|
-
# - A Zone can be a Section, Title, a Paragraph or a List and represents an intra-section division of content.
|
25
|
-
# - A Sentence represents just that.
|
26
|
-
# - A Phrase is a group of words; a Sentence is a Phrase with a sentence ender (.!?)
|
27
|
-
# - A Token can be a Word, a Number, a Punctuation or a Symbol (non-punctuation, non-alphanumeric character).
|
28
|
-
#
|
29
|
-
# === Functions
|
30
|
-
#
|
31
|
-
# A worker class is defined for each implemented algorithm performing a given
|
32
|
-
# task. These classes are clustered into workers performing the same given task
|
33
|
-
# differently (Group), and the groups are clustered into Categories
|
34
|
-
# of groups of workers that perform related tasks.
|
35
|
-
#
|
36
|
-
# Here are the different Categories and their description:
|
37
|
-
#
|
38
|
-
# - Processors perform the building of tree of entities representing texts (chunking, segmenting, tokenizing, parsing).
|
39
|
-
# - Lexicalizers give lexical information about words (synsets, semantic relationships, tag, word category).
|
40
|
-
# - Extractors extract semantic information about an entity (topic, date, time, named entity).
|
41
|
-
# - Inflectors allow to retrieve the different inflections of a word (declensors, conjugators, stemmers, lemmatizers).
|
42
|
-
# - Formatters handle the conversion of entities to and from different formats (readers, serializers, unserializers, visualizers).
|
43
|
-
# - Retrievers allow to index and search collections of documents.
|
44
|
-
#
|
45
|
-
# === Linguistic Resources
|
46
|
-
#
|
47
|
-
# The Languages module contains linguistic information about
|
48
|
-
# languages (full ISO-639-1 and 2 language list, tag alignments
|
49
|
-
# for three treebanks, word categories, etc.)
|
50
|
-
#
|
51
|
-
# === Exception Class.
|
52
|
-
#
|
53
|
-
# Treat::Exception defines a custom exception class for the Treat module.
|
54
|
-
#
|
55
1
|
module Treat
|
2
|
+
|
3
|
+
# Require custom exception class.
|
4
|
+
require 'treat/exception'
|
56
5
|
|
57
|
-
#
|
6
|
+
# Treat requires Ruby 1.9 or higher.
|
58
7
|
if RUBY_VERSION <= '1.9'
|
59
|
-
raise
|
8
|
+
raise Treat::Exception,
|
9
|
+
'Treat requires Ruby 1.9 or higher.'
|
60
10
|
end
|
61
11
|
|
62
12
|
# The current version of Treat.
|
63
|
-
VERSION = "0.
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
13
|
+
VERSION = "1.0.0"
|
14
|
+
|
15
|
+
# Add methods to handle syntactic sugar,
|
16
|
+
# language configuration options, and paths.
|
17
|
+
require 'treat/configurable'
|
18
|
+
extend Treat::Configurable
|
19
|
+
|
20
|
+
# The folders in the library and descriptions.
|
21
|
+
Paths = {
|
22
|
+
:tmp => 'temporary files',
|
23
|
+
:lib => 'class and module definitions',
|
24
|
+
:bin => 'binary files',
|
25
|
+
:files => 'user-saved files',
|
26
|
+
:data => 'data set files',
|
27
|
+
:models => 'model files',
|
28
|
+
:spec => 'spec test files'
|
29
|
+
}
|
30
|
+
|
31
|
+
# Add methods to provide access to common paths.
|
68
32
|
class << self
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
# Symbol - the ideal entity level to detect language at
|
78
|
-
# (e.g., :entity, :sentence, :zone, :section, :document)
|
79
|
-
attr_accessor :language_detection_level
|
80
|
-
# String - folder of this file.
|
81
|
-
attr_accessor :lib
|
82
|
-
# String - folder for tests.
|
83
|
-
attr_accessor :test
|
33
|
+
Paths.each do |path, _|
|
34
|
+
define_method(path) do
|
35
|
+
(File.dirname(__FILE__).
|
36
|
+
split('/')[0..-2].join('/') +
|
37
|
+
'/' + path.to_s + '/').gsub(
|
38
|
+
'lib/../', '')
|
39
|
+
end
|
40
|
+
end
|
84
41
|
end
|
85
42
|
|
86
|
-
# Turn off debug by default.
|
87
|
-
self.debug = false
|
88
|
-
# Set the default language to english.
|
89
|
-
self.default_language = :eng
|
90
|
-
# Set the default encoding to utf-8.
|
91
|
-
self.default_encoding = :utf_8
|
92
|
-
# Turn language detection off by default.
|
93
|
-
self.detect_language = false
|
94
|
-
# Detect the language once per text by default.
|
95
|
-
self.language_detection_level = :zone
|
96
|
-
# Set the lib path to that of this file.
|
97
|
-
self.lib = File.dirname(__FILE__)
|
98
|
-
# Set the paths to the test folder.
|
99
|
-
self.test = self.lib + '/../test'
|
100
|
-
|
101
|
-
# Require inline C
|
102
|
-
# require 'inline'
|
103
|
-
|
104
|
-
# Require modified core classes.
|
105
43
|
require 'treat/object'
|
106
44
|
require 'treat/kernel'
|
107
|
-
|
108
|
-
require 'treat/exception'
|
45
|
+
require 'treat/downloader'
|
109
46
|
require 'treat/languages'
|
47
|
+
require 'treat/linguistics'
|
110
48
|
require 'treat/entities'
|
111
49
|
require 'treat/categories'
|
50
|
+
require 'treat/data_set'
|
112
51
|
require 'treat/proxies'
|
113
|
-
require 'treat/sugar'
|
114
|
-
|
115
|
-
# Make sugar available when needed.
|
116
|
-
extend Treat::Sugar
|
117
52
|
|
118
53
|
# Install packages for a given language.
|
119
54
|
def self.install(language = :english)
|
120
|
-
require 'treat/
|
55
|
+
require 'treat/installer'
|
121
56
|
Treat::Installer.install(language)
|
122
57
|
end
|
123
58
|
|
124
|
-
#
|
125
|
-
|
126
|
-
|
59
|
+
# Enable syntactic sugar by default.
|
60
|
+
Treat.sweeten!
|
61
|
+
|
127
62
|
end
|
data/lib/treat/ai/classifiers/id3.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
class Treat::AI::Classifiers::ID3
|
2
|
+
|
3
|
+
require 'decisiontree'
|
4
|
+
|
5
|
+
@@classifiers = {}
|
6
|
+
|
7
|
+
def self.classify(entity, options = {})
|
8
|
+
|
9
|
+
set = options[:training]
|
10
|
+
cl = set.classification
|
11
|
+
|
12
|
+
if !@@classifiers[cl]
|
13
|
+
dec_tree = DecisionTree::ID3Tree.new(
|
14
|
+
set.labels, set.items,
|
15
|
+
cl.default, :continuous)
|
16
|
+
dec_tree.train
|
17
|
+
else
|
18
|
+
dec_tree = @@classifiers[cl]
|
19
|
+
end
|
20
|
+
|
21
|
+
dec_tree.predict(
|
22
|
+
cl.export_item(entity, false)
|
23
|
+
)[0]
|
24
|
+
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
data/lib/treat/categories.rb
CHANGED
@@ -1,43 +1,90 @@
|
|
1
|
-
module Treat
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
1
|
+
# This module keeps track of all the Treat::Categorizable
|
2
|
+
# modules that exist and the methods they define.
|
3
|
+
#
|
4
|
+
#
|
5
|
+
# - Processors perform the building of tree of
|
6
|
+
# entities representing texts (chunking,
|
7
|
+
# segmenting, tokenizing, parsing).
|
8
|
+
# - Lexicalizers give lexical information about
|
9
|
+
# words (synsets, semantic relationships,
|
10
|
+
# tag, word category).
|
11
|
+
# - Extractors extract semantic information about
|
12
|
+
# an entity (language, topic, date, time, named
|
13
|
+
# entity, coreferences).
|
14
|
+
# - Inflectors allow retrieving the different
|
15
|
+
# inflections of a word (declensors, conjugators,
|
16
|
+
# stemmers, lemmatizers).
|
17
|
+
# - Formatters handle the conversion of entities to
|
18
|
+
# and from different formats (readers, serializers,
|
19
|
+
# unserializers, visualizers).
|
20
|
+
# - Retrievers allow indexing and searching collections
|
21
|
+
# of documents.
|
22
|
+
module Treat::Categories
|
23
|
+
|
24
|
+
class << self
|
25
|
+
# A list of all categories.
|
26
|
+
attr_accessor :list
|
27
|
+
end
|
28
|
+
|
29
|
+
# Array - list of all categories.
|
30
|
+
self.list = []
|
31
|
+
# A lookup table for entity types.
|
32
|
+
@@lookup = {}
|
33
|
+
|
34
|
+
# Require all categories.
|
35
|
+
require 'treat/categorizable'
|
36
|
+
require 'treat/formatters'
|
37
|
+
require 'treat/processors'
|
38
|
+
require 'treat/lexicalizers'
|
39
|
+
require 'treat/inflectors'
|
40
|
+
require 'treat/extractors'
|
41
|
+
require 'treat/retrievers'
|
42
|
+
require 'treat/ai'
|
43
|
+
|
44
|
+
# Create the lookup table.
|
45
|
+
self.list.each do |category|
|
46
|
+
category.groups.each do |group|
|
47
|
+
group = category.const_get(group)
|
48
|
+
@@lookup[group.method] = group
|
49
|
+
group.presets.each do |x,y|
|
50
|
+
@@lookup[x] = group
|
51
|
+
end if group.presets
|
8
52
|
end
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
53
|
+
end
|
54
|
+
|
55
|
+
# Find the class of a group given its method.
|
56
|
+
def self.lookup(method)
|
57
|
+
@@lookup[method]
|
58
|
+
end
|
59
|
+
|
60
|
+
# Fix -- This must be moved urgently.
|
61
|
+
Treat::Entities::Entity.class_eval do
|
62
|
+
|
63
|
+
alias :true_language :language
|
64
|
+
|
65
|
+
def language(extractor = nil, options = {})
|
16
66
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
67
|
+
if is_a?(Treat::Entities::Symbol) ||
|
68
|
+
is_a?(Treat::Entities::Number)
|
69
|
+
return Treat.default_language
|
70
|
+
end
|
71
|
+
|
72
|
+
if !Treat.detect_language
|
73
|
+
return Treat.default_language
|
74
|
+
else
|
75
|
+
dlvl = Treat.language_detection_level
|
76
|
+
if (Treat::Entities.rank(type) <
|
77
|
+
Treat::Entities.rank(dlvl)) &&
|
78
|
+
has_parent?
|
79
|
+
anc = ancestor_with_type(dlvl)
|
80
|
+
return anc.language if anc
|
29
81
|
end
|
30
82
|
end
|
31
83
|
|
32
|
-
|
84
|
+
true_language(extractor, options)
|
85
|
+
|
33
86
|
end
|
34
|
-
|
35
|
-
require 'treat/category'
|
36
|
-
require 'treat/formatters'
|
37
|
-
require 'treat/processors'
|
38
|
-
require 'treat/lexicalizers'
|
39
|
-
require 'treat/inflectors'
|
40
|
-
require 'treat/extractors'
|
41
|
-
require 'treat/retrievers'
|
87
|
+
|
42
88
|
end
|
89
|
+
|
43
90
|
end
|
data/lib/treat/categorizable.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
# A categorizable module brings together groups
|
2
|
+
# of algorithms that perform similar functions.
|
3
|
+
module Treat::Categorizable
|
4
|
+
|
5
|
+
# The contents of each categorizable
|
6
|
+
# module are groupable.
|
7
|
+
require 'treat/groupable'
|
8
|
+
|
9
|
+
# Add workers to the Entities based on the
|
10
|
+
# configuration for a given category.
|
11
|
+
def self.extended(category)
|
12
|
+
Treat::Categories.list << category
|
13
|
+
category.module_eval do
|
14
|
+
groups.each do |group|
|
15
|
+
group = const_get(group)
|
16
|
+
group.targets.each do |entity_type|
|
17
|
+
entity = Treat::Entities.
|
18
|
+
const_get(cc(entity_type))
|
19
|
+
entity.class_eval do
|
20
|
+
add_workers group
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
# Get the list of groups defined
|
28
|
+
# under this module.
|
29
|
+
@@groups = self.constants
|
30
|
+
|
31
|
+
# Populate a list of methods.
|
32
|
+
@@methods = []
|
33
|
+
@@groups.each do |group|
|
34
|
+
@@methods << const_get(group).method
|
35
|
+
end
|
36
|
+
|
37
|
+
# Provide a list of methods implemented in
|
38
|
+
# the groups contained within this category.
|
39
|
+
def methods; @@methods; end
|
40
|
+
|
41
|
+
# Provides a list of groups within this category.
|
42
|
+
def groups; self.constants; end
|
43
|
+
|
44
|
+
end
|
data/lib/treat/classification.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
class Treat::Classification
|
2
|
+
|
3
|
+
attr_reader :types
|
4
|
+
attr_reader :features
|
5
|
+
attr_reader :question
|
6
|
+
attr_reader :labels
|
7
|
+
attr_reader :default
|
8
|
+
|
9
|
+
def initialize(type_or_types, feature_or_features, question, default = false)
|
10
|
+
|
11
|
+
@types, @features,
|
12
|
+
@question, @default =
|
13
|
+
[*type_or_types],
|
14
|
+
[*feature_or_features],
|
15
|
+
question, default
|
16
|
+
|
17
|
+
@labels = []
|
18
|
+
|
19
|
+
@features.each do |cmd|
|
20
|
+
if cmd.is_a?(Array)
|
21
|
+
@labels << cmd[0]
|
22
|
+
else
|
23
|
+
@labels << cmd
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
28
|
+
|
29
|
+
def export_item(e, include_question = true)
|
30
|
+
|
31
|
+
line = []
|
32
|
+
|
33
|
+
@features.each do |cmd|
|
34
|
+
begin
|
35
|
+
if cmd.is_a?(Array)
|
36
|
+
line << cmd[1].call(e)
|
37
|
+
else
|
38
|
+
line << e.send(cmd)
|
39
|
+
end
|
40
|
+
rescue Treat::Exception
|
41
|
+
dflt = (
|
42
|
+
(cmd.is_a?(Array) && cmd[2]) ?
|
43
|
+
cmd[2] : nil
|
44
|
+
)
|
45
|
+
line << dflt
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
begin
|
50
|
+
if include_question
|
51
|
+
line << e.send(@question)
|
52
|
+
end
|
53
|
+
rescue Treat::Exception
|
54
|
+
line << @default
|
55
|
+
end
|
56
|
+
line[-1] = '' if line[-1].nil?
|
57
|
+
line
|
58
|
+
|
59
|
+
end
|
60
|
+
|
61
|
+
end
|