treat 1.0.6 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +2 -4
- data/README.md +13 -12
- data/bin/MANIFEST +1 -0
- data/bin/stanford/bridge.jar +0 -0
- data/bin/stanford/joda-time.jar +0 -0
- data/bin/stanford/stanford-corenlp.jar +0 -0
- data/bin/stanford/stanford-parser.jar +0 -0
- data/bin/stanford/xom.jar +0 -0
- data/files/{www.economist.com/21552208 → 21552208.html} +86 -89
- data/files/{guides.rubyonrails.org/3_2_release_notes.html → 3_2_release_notes.html} +0 -0
- data/files/{INFO → MANIFEST} +0 -0
- data/files/{www.rubyinside.com/nethttp-cheat-sheet-2940.html → nethttp-cheat-sheet-2940.html} +12 -16
- data/files/weather-central-canada-heat-wave.html +1370 -0
- data/lib/treat/config/core/acronyms.rb +4 -0
- data/lib/treat/config/core/encodings.rb +8 -0
- data/lib/treat/config/core/entities.rb +2 -0
- data/lib/treat/config/core/language.rb +3 -0
- data/lib/treat/config/core/paths.rb +8 -0
- data/lib/treat/config/core/syntax.rb +1 -0
- data/lib/treat/config/core/verbosity.rb +1 -0
- data/lib/treat/config/databases/mongo.rb +3 -0
- data/lib/treat/config/languages/agnostic.rb +34 -0
- data/lib/treat/config/languages/arabic.rb +13 -0
- data/lib/treat/config/languages/chinese.rb +13 -0
- data/lib/treat/config/languages/dutch.rb +12 -0
- data/lib/treat/config/languages/english.rb +60 -0
- data/lib/treat/config/languages/french.rb +18 -0
- data/lib/treat/config/languages/german.rb +18 -0
- data/lib/treat/config/languages/greek.rb +12 -0
- data/lib/treat/config/languages/italian.rb +12 -0
- data/lib/treat/config/languages/polish.rb +12 -0
- data/lib/treat/config/languages/portuguese.rb +12 -0
- data/lib/treat/config/languages/russian.rb +12 -0
- data/lib/treat/config/languages/spanish.rb +12 -0
- data/lib/treat/config/languages/swedish.rb +12 -0
- data/lib/treat/config/libraries/stanford.rb +1 -0
- data/lib/treat/config/linguistics/categories.rb +4 -0
- data/lib/treat/config/linguistics/punctuation.rb +33 -0
- data/lib/treat/config/tags/aligned.rb +221 -0
- data/lib/treat/config/tags/enju.rb +71 -0
- data/lib/treat/config/tags/paris7.rb +17 -0
- data/lib/treat/config/tags/ptb.rb +15 -0
- data/lib/treat/config/workers/extractors.rb +39 -0
- data/lib/treat/config/workers/formatters.rb +20 -0
- data/lib/treat/config/workers/inflectors.rb +27 -0
- data/lib/treat/config/workers/learners.rb +6 -0
- data/lib/treat/config/workers/lexicalizers.rb +18 -0
- data/lib/treat/config/workers/list.rb +1 -0
- data/lib/treat/config/workers/processors.rb +19 -0
- data/lib/treat/config/workers/retrievers.rb +12 -0
- data/lib/treat/config.rb +125 -0
- data/lib/treat/{classification.rb → core/classification.rb} +1 -1
- data/lib/treat/{data_set.rb → core/data_set.rb} +1 -4
- data/lib/treat/{tree.rb → core/node.rb} +5 -5
- data/lib/treat/core/server.rb +3 -0
- data/lib/treat/core.rb +5 -0
- data/lib/treat/entities/abilities/buildable.rb +61 -56
- data/lib/treat/entities/abilities/checkable.rb +2 -2
- data/lib/treat/entities/abilities/comparable.rb +21 -0
- data/lib/treat/entities/abilities/copyable.rb +2 -0
- data/lib/treat/entities/abilities/countable.rb +1 -1
- data/lib/treat/entities/abilities/debuggable.rb +1 -1
- data/lib/treat/entities/abilities/delegatable.rb +42 -36
- data/lib/treat/entities/abilities/doable.rb +2 -2
- data/lib/treat/entities/abilities/exportable.rb +1 -1
- data/lib/treat/entities/abilities/iterable.rb +21 -33
- data/lib/treat/entities/abilities/magical.rb +8 -8
- data/lib/treat/entities/abilities/registrable.rb +0 -38
- data/lib/treat/entities/abilities/stringable.rb +19 -19
- data/lib/treat/entities/collection.rb +31 -0
- data/lib/treat/entities/document.rb +10 -0
- data/lib/treat/entities/entity.rb +18 -13
- data/lib/treat/entities/group.rb +15 -0
- data/lib/treat/entities/section.rb +13 -0
- data/lib/treat/entities/token.rb +35 -0
- data/lib/treat/entities/zone.rb +11 -0
- data/lib/treat/entities.rb +5 -75
- data/lib/treat/helpers/didyoumean.rb +57 -0
- data/lib/treat/helpers/escaping.rb +15 -0
- data/lib/treat/helpers/formatting.rb +41 -0
- data/lib/treat/helpers/platform.rb +15 -0
- data/lib/treat/helpers/reflection.rb +17 -0
- data/lib/treat/helpers/temporary.rb +27 -0
- data/lib/treat/helpers/verbosity.rb +19 -0
- data/lib/treat/helpers.rb +5 -0
- data/lib/treat/installer.rb +46 -165
- data/lib/treat/loaders/linguistics.rb +22 -27
- data/lib/treat/loaders/stanford.rb +23 -41
- data/lib/treat/loaders.rb +10 -0
- data/lib/treat/proxies.rb +73 -24
- data/lib/treat/version.rb +3 -0
- data/lib/treat/{extractors → workers/extractors}/keywords/tf_idf.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/language/what_language.rb +11 -4
- data/lib/treat/{extractors → workers/extractors}/name_tag/stanford.rb +3 -4
- data/lib/treat/{extractors → workers/extractors}/tf_idf/native.rb +4 -5
- data/lib/treat/{extractors → workers/extractors}/time/chronic.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/time/nickel.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/time/ruby.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/topic_words/lda.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/topics/reuters.rb +4 -4
- data/lib/treat/{formatters → workers/formatters}/readers/abw.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/autoselect.rb +10 -3
- data/lib/treat/{formatters → workers/formatters}/readers/doc.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/html.rb +4 -4
- data/lib/treat/{formatters → workers/formatters}/readers/image.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/odt.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/pdf.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/txt.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/xml.rb +2 -2
- data/lib/treat/workers/formatters/serializers/mongo.rb +60 -0
- data/lib/treat/{formatters → workers/formatters}/serializers/xml.rb +1 -2
- data/lib/treat/{formatters → workers/formatters}/serializers/yaml.rb +1 -1
- data/lib/treat/{formatters → workers/formatters}/unserializers/autoselect.rb +3 -1
- data/lib/treat/workers/formatters/unserializers/mongo.rb +80 -0
- data/lib/treat/{formatters → workers/formatters}/unserializers/xml.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/unserializers/yaml.rb +1 -1
- data/lib/treat/{formatters → workers/formatters}/visualizers/dot.rb +1 -1
- data/lib/treat/{formatters → workers/formatters}/visualizers/standoff.rb +2 -3
- data/lib/treat/{formatters → workers/formatters}/visualizers/tree.rb +1 -1
- data/lib/treat/{groupable.rb → workers/group.rb} +6 -12
- data/lib/treat/{inflectors → workers/inflectors}/cardinalizers/linguistics.rb +7 -2
- data/lib/treat/{inflectors → workers/inflectors}/conjugators/linguistics.rb +11 -11
- data/lib/treat/{inflectors → workers/inflectors}/declensors/active_support.rb +2 -2
- data/lib/treat/{inflectors → workers/inflectors}/declensors/english/inflect.rb +1 -1
- data/lib/treat/{inflectors → workers/inflectors}/declensors/english.rb +2 -2
- data/lib/treat/{inflectors → workers/inflectors}/declensors/linguistics.rb +4 -4
- data/lib/treat/{inflectors → workers/inflectors}/ordinalizers/linguistics.rb +8 -2
- data/lib/treat/{inflectors → workers/inflectors}/stemmers/porter.rb +2 -2
- data/lib/treat/{inflectors → workers/inflectors}/stemmers/porter_c.rb +1 -1
- data/lib/treat/{inflectors → workers/inflectors}/stemmers/uea.rb +1 -1
- data/lib/treat/{ai → workers/learners}/classifiers/id3.rb +1 -1
- data/lib/treat/{ai → workers/learners}/classifiers/mlp.rb +1 -1
- data/lib/treat/{lexicalizers → workers/lexicalizers}/categorizers/from_tag.rb +9 -9
- data/lib/treat/{lexicalizers → workers/lexicalizers}/sensers/wordnet/synset.rb +2 -2
- data/lib/treat/{lexicalizers → workers/lexicalizers}/sensers/wordnet.rb +4 -4
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/brill/patch.rb +2 -2
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/brill.rb +2 -8
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/lingua.rb +1 -6
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/stanford.rb +31 -42
- data/lib/treat/workers/processors/chunkers/autoselect.rb +19 -0
- data/lib/treat/{processors → workers/processors}/chunkers/html.rb +4 -3
- data/lib/treat/workers/processors/chunkers/txt.rb +32 -0
- data/lib/treat/{processors → workers/processors}/parsers/enju.rb +3 -3
- data/lib/treat/{processors → workers/processors}/parsers/stanford.rb +6 -8
- data/lib/treat/{processors → workers/processors}/segmenters/punkt.rb +6 -10
- data/lib/treat/{processors → workers/processors}/segmenters/stanford.rb +2 -2
- data/lib/treat/{processors → workers/processors}/segmenters/tactful.rb +3 -6
- data/lib/treat/{processors → workers/processors}/tokenizers/ptb.rb +6 -5
- data/lib/treat/{processors → workers/processors}/tokenizers/punkt.rb +1 -1
- data/lib/treat/{processors → workers/processors}/tokenizers/stanford.rb +1 -1
- data/lib/treat/{processors → workers/processors}/tokenizers/tactful.rb +3 -5
- data/lib/treat/{retrievers → workers/retrievers}/indexers/ferret.rb +1 -1
- data/lib/treat/{retrievers → workers/retrievers}/searchers/ferret.rb +1 -1
- data/lib/treat/workers.rb +96 -0
- data/lib/treat.rb +23 -49
- data/spec/collection.rb +4 -4
- data/spec/document.rb +5 -5
- data/spec/entity.rb +33 -32
- data/spec/{tree.rb → node.rb} +5 -5
- data/spec/phrase.rb +5 -39
- data/spec/sandbox.rb +212 -6
- data/spec/token.rb +12 -9
- data/spec/treat.rb +12 -9
- data/spec/word.rb +10 -9
- data/spec/zone.rb +6 -2
- data/tmp/{INFO → MANIFEST} +0 -0
- data/tmp/english.yaml +10340 -0
- metadata +149 -139
- data/lib/treat/ai.rb +0 -12
- data/lib/treat/categories.rb +0 -90
- data/lib/treat/categorizable.rb +0 -44
- data/lib/treat/configurable.rb +0 -115
- data/lib/treat/dependencies.rb +0 -25
- data/lib/treat/downloader.rb +0 -87
- data/lib/treat/entities/abilities.rb +0 -10
- data/lib/treat/entities/entities.rb +0 -102
- data/lib/treat/exception.rb +0 -7
- data/lib/treat/extractors.rb +0 -79
- data/lib/treat/formatters/serializers/mongo.rb +0 -64
- data/lib/treat/formatters.rb +0 -41
- data/lib/treat/helpers/decimal_point_escaper.rb +0 -22
- data/lib/treat/inflectors.rb +0 -52
- data/lib/treat/kernel.rb +0 -208
- data/lib/treat/languages/arabic.rb +0 -16
- data/lib/treat/languages/chinese.rb +0 -16
- data/lib/treat/languages/dutch.rb +0 -16
- data/lib/treat/languages/english.rb +0 -63
- data/lib/treat/languages/french.rb +0 -20
- data/lib/treat/languages/german.rb +0 -20
- data/lib/treat/languages/greek.rb +0 -16
- data/lib/treat/languages/italian.rb +0 -17
- data/lib/treat/languages/language.rb +0 -10
- data/lib/treat/languages/list.txt +0 -504
- data/lib/treat/languages/polish.rb +0 -16
- data/lib/treat/languages/portuguese.rb +0 -16
- data/lib/treat/languages/russian.rb +0 -16
- data/lib/treat/languages/spanish.rb +0 -16
- data/lib/treat/languages/swedish.rb +0 -16
- data/lib/treat/languages.rb +0 -132
- data/lib/treat/lexicalizers.rb +0 -37
- data/lib/treat/object.rb +0 -7
- data/lib/treat/processors/chunkers/autoselect.rb +0 -16
- data/lib/treat/processors/chunkers/txt.rb +0 -21
- data/lib/treat/processors.rb +0 -38
- data/lib/treat/retrievers.rb +0 -27
- data/lib/treat/server.rb +0 -26
- data/lib/treat/universalisation/encodings.rb +0 -12
- data/lib/treat/universalisation/tags.rb +0 -453
- data/lib/treat/universalisation.rb +0 -9
- data/spec/languages.rb +0 -25
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: treat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0
|
4
|
+
version: 1.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,16 +9,16 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-07-11 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
|
-
name:
|
15
|
+
name: schiphol
|
16
16
|
requirement: !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
20
20
|
- !ruby/object:Gem::Version
|
21
|
-
version: 0
|
21
|
+
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
24
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -26,23 +26,7 @@ dependencies:
|
|
26
26
|
requirements:
|
27
27
|
- - ! '>='
|
28
28
|
- !ruby/object:Gem::Version
|
29
|
-
version: 0
|
30
|
-
- !ruby/object:Gem::Dependency
|
31
|
-
name: progressbar
|
32
|
-
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
|
-
requirements:
|
35
|
-
- - ! '>='
|
36
|
-
- !ruby/object:Gem::Version
|
37
|
-
version: 0.10.0
|
38
|
-
type: :runtime
|
39
|
-
prerelease: false
|
40
|
-
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
|
-
requirements:
|
43
|
-
- - ! '>='
|
44
|
-
- !ruby/object:Gem::Version
|
45
|
-
version: 0.10.0
|
29
|
+
version: '0'
|
46
30
|
- !ruby/object:Gem::Dependency
|
47
31
|
name: rspec
|
48
32
|
requirement: !ruby/object:Gem::Requirement
|
@@ -50,7 +34,7 @@ dependencies:
|
|
50
34
|
requirements:
|
51
35
|
- - ! '>='
|
52
36
|
- !ruby/object:Gem::Version
|
53
|
-
version:
|
37
|
+
version: '0'
|
54
38
|
type: :development
|
55
39
|
prerelease: false
|
56
40
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -58,7 +42,7 @@ dependencies:
|
|
58
42
|
requirements:
|
59
43
|
- - ! '>='
|
60
44
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
45
|
+
version: '0'
|
62
46
|
- !ruby/object:Gem::Dependency
|
63
47
|
name: rake
|
64
48
|
requirement: !ruby/object:Gem::Requirement
|
@@ -66,7 +50,7 @@ dependencies:
|
|
66
50
|
requirements:
|
67
51
|
- - ! '>='
|
68
52
|
- !ruby/object:Gem::Version
|
69
|
-
version: 0
|
53
|
+
version: '0'
|
70
54
|
type: :development
|
71
55
|
prerelease: false
|
72
56
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -74,26 +58,66 @@ dependencies:
|
|
74
58
|
requirements:
|
75
59
|
- - ! '>='
|
76
60
|
- !ruby/object:Gem::Version
|
77
|
-
version: 0
|
78
|
-
description: ! ' Treat is a
|
61
|
+
version: '0'
|
62
|
+
description: ! ' Treat is a natural language processing framework for Ruby. '
|
79
63
|
email:
|
80
64
|
- louis.mullie@gmail.com
|
81
65
|
executables: []
|
82
66
|
extensions: []
|
83
67
|
extra_rdoc_files: []
|
84
68
|
files:
|
85
|
-
-
|
86
|
-
-
|
87
|
-
-
|
88
|
-
-
|
89
|
-
-
|
90
|
-
-
|
91
|
-
- lib/treat/
|
92
|
-
- lib/treat/
|
93
|
-
- lib/treat/
|
94
|
-
- lib/treat/
|
69
|
+
- bin/MANIFEST
|
70
|
+
- bin/stanford/bridge.jar
|
71
|
+
- bin/stanford/joda-time.jar
|
72
|
+
- bin/stanford/stanford-corenlp.jar
|
73
|
+
- bin/stanford/stanford-parser.jar
|
74
|
+
- bin/stanford/xom.jar
|
75
|
+
- lib/treat/config/core/acronyms.rb
|
76
|
+
- lib/treat/config/core/encodings.rb
|
77
|
+
- lib/treat/config/core/entities.rb
|
78
|
+
- lib/treat/config/core/language.rb
|
79
|
+
- lib/treat/config/core/paths.rb
|
80
|
+
- lib/treat/config/core/syntax.rb
|
81
|
+
- lib/treat/config/core/verbosity.rb
|
82
|
+
- lib/treat/config/databases/mongo.rb
|
83
|
+
- lib/treat/config/languages/agnostic.rb
|
84
|
+
- lib/treat/config/languages/arabic.rb
|
85
|
+
- lib/treat/config/languages/chinese.rb
|
86
|
+
- lib/treat/config/languages/dutch.rb
|
87
|
+
- lib/treat/config/languages/english.rb
|
88
|
+
- lib/treat/config/languages/french.rb
|
89
|
+
- lib/treat/config/languages/german.rb
|
90
|
+
- lib/treat/config/languages/greek.rb
|
91
|
+
- lib/treat/config/languages/italian.rb
|
92
|
+
- lib/treat/config/languages/polish.rb
|
93
|
+
- lib/treat/config/languages/portuguese.rb
|
94
|
+
- lib/treat/config/languages/russian.rb
|
95
|
+
- lib/treat/config/languages/spanish.rb
|
96
|
+
- lib/treat/config/languages/swedish.rb
|
97
|
+
- lib/treat/config/libraries/stanford.rb
|
98
|
+
- lib/treat/config/linguistics/categories.rb
|
99
|
+
- lib/treat/config/linguistics/punctuation.rb
|
100
|
+
- lib/treat/config/tags/aligned.rb
|
101
|
+
- lib/treat/config/tags/enju.rb
|
102
|
+
- lib/treat/config/tags/paris7.rb
|
103
|
+
- lib/treat/config/tags/ptb.rb
|
104
|
+
- lib/treat/config/workers/extractors.rb
|
105
|
+
- lib/treat/config/workers/formatters.rb
|
106
|
+
- lib/treat/config/workers/inflectors.rb
|
107
|
+
- lib/treat/config/workers/learners.rb
|
108
|
+
- lib/treat/config/workers/lexicalizers.rb
|
109
|
+
- lib/treat/config/workers/list.rb
|
110
|
+
- lib/treat/config/workers/processors.rb
|
111
|
+
- lib/treat/config/workers/retrievers.rb
|
112
|
+
- lib/treat/config.rb
|
113
|
+
- lib/treat/core/classification.rb
|
114
|
+
- lib/treat/core/data_set.rb
|
115
|
+
- lib/treat/core/node.rb
|
116
|
+
- lib/treat/core/server.rb
|
117
|
+
- lib/treat/core.rb
|
95
118
|
- lib/treat/entities/abilities/buildable.rb
|
96
119
|
- lib/treat/entities/abilities/checkable.rb
|
120
|
+
- lib/treat/entities/abilities/comparable.rb
|
97
121
|
- lib/treat/entities/abilities/copyable.rb
|
98
122
|
- lib/treat/entities/abilities/countable.rb
|
99
123
|
- lib/treat/entities/abilities/debuggable.rb
|
@@ -104,109 +128,96 @@ files:
|
|
104
128
|
- lib/treat/entities/abilities/magical.rb
|
105
129
|
- lib/treat/entities/abilities/registrable.rb
|
106
130
|
- lib/treat/entities/abilities/stringable.rb
|
107
|
-
- lib/treat/entities/
|
108
|
-
- lib/treat/entities/
|
131
|
+
- lib/treat/entities/collection.rb
|
132
|
+
- lib/treat/entities/document.rb
|
109
133
|
- lib/treat/entities/entity.rb
|
134
|
+
- lib/treat/entities/group.rb
|
135
|
+
- lib/treat/entities/section.rb
|
136
|
+
- lib/treat/entities/token.rb
|
137
|
+
- lib/treat/entities/zone.rb
|
110
138
|
- lib/treat/entities.rb
|
111
|
-
- lib/treat/
|
112
|
-
- lib/treat/
|
113
|
-
- lib/treat/
|
114
|
-
- lib/treat/
|
115
|
-
- lib/treat/
|
116
|
-
- lib/treat/
|
117
|
-
- lib/treat/
|
118
|
-
- lib/treat/
|
119
|
-
- lib/treat/extractors/topic_words/lda.rb
|
120
|
-
- lib/treat/extractors/topics/reuters.rb
|
121
|
-
- lib/treat/extractors.rb
|
122
|
-
- lib/treat/formatters/readers/abw.rb
|
123
|
-
- lib/treat/formatters/readers/autoselect.rb
|
124
|
-
- lib/treat/formatters/readers/doc.rb
|
125
|
-
- lib/treat/formatters/readers/html.rb
|
126
|
-
- lib/treat/formatters/readers/image.rb
|
127
|
-
- lib/treat/formatters/readers/odt.rb
|
128
|
-
- lib/treat/formatters/readers/pdf.rb
|
129
|
-
- lib/treat/formatters/readers/txt.rb
|
130
|
-
- lib/treat/formatters/readers/xml.rb
|
131
|
-
- lib/treat/formatters/serializers/mongo.rb
|
132
|
-
- lib/treat/formatters/serializers/xml.rb
|
133
|
-
- lib/treat/formatters/serializers/yaml.rb
|
134
|
-
- lib/treat/formatters/unserializers/autoselect.rb
|
135
|
-
- lib/treat/formatters/unserializers/xml.rb
|
136
|
-
- lib/treat/formatters/unserializers/yaml.rb
|
137
|
-
- lib/treat/formatters/visualizers/dot.rb
|
138
|
-
- lib/treat/formatters/visualizers/standoff.rb
|
139
|
-
- lib/treat/formatters/visualizers/tree.rb
|
140
|
-
- lib/treat/formatters.rb
|
141
|
-
- lib/treat/groupable.rb
|
142
|
-
- lib/treat/helpers/decimal_point_escaper.rb
|
143
|
-
- lib/treat/inflectors/cardinalizers/linguistics.rb
|
144
|
-
- lib/treat/inflectors/conjugators/linguistics.rb
|
145
|
-
- lib/treat/inflectors/declensors/active_support.rb
|
146
|
-
- lib/treat/inflectors/declensors/english/inflect.rb
|
147
|
-
- lib/treat/inflectors/declensors/english.rb
|
148
|
-
- lib/treat/inflectors/declensors/linguistics.rb
|
149
|
-
- lib/treat/inflectors/ordinalizers/linguistics.rb
|
150
|
-
- lib/treat/inflectors/stemmers/porter.rb
|
151
|
-
- lib/treat/inflectors/stemmers/porter_c.rb
|
152
|
-
- lib/treat/inflectors/stemmers/uea.rb
|
153
|
-
- lib/treat/inflectors.rb
|
139
|
+
- lib/treat/helpers/didyoumean.rb
|
140
|
+
- lib/treat/helpers/escaping.rb
|
141
|
+
- lib/treat/helpers/formatting.rb
|
142
|
+
- lib/treat/helpers/platform.rb
|
143
|
+
- lib/treat/helpers/reflection.rb
|
144
|
+
- lib/treat/helpers/temporary.rb
|
145
|
+
- lib/treat/helpers/verbosity.rb
|
146
|
+
- lib/treat/helpers.rb
|
154
147
|
- lib/treat/installer.rb
|
155
|
-
- lib/treat/kernel.rb
|
156
|
-
- lib/treat/languages/arabic.rb
|
157
|
-
- lib/treat/languages/chinese.rb
|
158
|
-
- lib/treat/languages/dutch.rb
|
159
|
-
- lib/treat/languages/english.rb
|
160
|
-
- lib/treat/languages/french.rb
|
161
|
-
- lib/treat/languages/german.rb
|
162
|
-
- lib/treat/languages/greek.rb
|
163
|
-
- lib/treat/languages/italian.rb
|
164
|
-
- lib/treat/languages/language.rb
|
165
|
-
- lib/treat/languages/list.txt
|
166
|
-
- lib/treat/languages/polish.rb
|
167
|
-
- lib/treat/languages/portuguese.rb
|
168
|
-
- lib/treat/languages/russian.rb
|
169
|
-
- lib/treat/languages/spanish.rb
|
170
|
-
- lib/treat/languages/swedish.rb
|
171
|
-
- lib/treat/languages.rb
|
172
|
-
- lib/treat/lexicalizers/categorizers/from_tag.rb
|
173
|
-
- lib/treat/lexicalizers/sensers/wordnet/synset.rb
|
174
|
-
- lib/treat/lexicalizers/sensers/wordnet.rb
|
175
|
-
- lib/treat/lexicalizers/taggers/brill/patch.rb
|
176
|
-
- lib/treat/lexicalizers/taggers/brill.rb
|
177
|
-
- lib/treat/lexicalizers/taggers/lingua.rb
|
178
|
-
- lib/treat/lexicalizers/taggers/stanford.rb
|
179
|
-
- lib/treat/lexicalizers.rb
|
180
148
|
- lib/treat/loaders/linguistics.rb
|
181
149
|
- lib/treat/loaders/stanford.rb
|
182
|
-
- lib/treat/
|
183
|
-
- lib/treat/processors/chunkers/autoselect.rb
|
184
|
-
- lib/treat/processors/chunkers/html.rb
|
185
|
-
- lib/treat/processors/chunkers/txt.rb
|
186
|
-
- lib/treat/processors/parsers/enju.rb
|
187
|
-
- lib/treat/processors/parsers/stanford.rb
|
188
|
-
- lib/treat/processors/segmenters/punkt.rb
|
189
|
-
- lib/treat/processors/segmenters/stanford.rb
|
190
|
-
- lib/treat/processors/segmenters/tactful.rb
|
191
|
-
- lib/treat/processors/tokenizers/ptb.rb
|
192
|
-
- lib/treat/processors/tokenizers/punkt.rb
|
193
|
-
- lib/treat/processors/tokenizers/stanford.rb
|
194
|
-
- lib/treat/processors/tokenizers/tactful.rb
|
195
|
-
- lib/treat/processors.rb
|
150
|
+
- lib/treat/loaders.rb
|
196
151
|
- lib/treat/proxies.rb
|
197
|
-
- lib/treat/
|
198
|
-
- lib/treat/
|
199
|
-
- lib/treat/
|
200
|
-
- lib/treat/
|
201
|
-
- lib/treat/
|
202
|
-
- lib/treat/
|
203
|
-
- lib/treat/
|
204
|
-
- lib/treat/
|
152
|
+
- lib/treat/version.rb
|
153
|
+
- lib/treat/workers/extractors/keywords/tf_idf.rb
|
154
|
+
- lib/treat/workers/extractors/language/what_language.rb
|
155
|
+
- lib/treat/workers/extractors/name_tag/stanford.rb
|
156
|
+
- lib/treat/workers/extractors/tf_idf/native.rb
|
157
|
+
- lib/treat/workers/extractors/time/chronic.rb
|
158
|
+
- lib/treat/workers/extractors/time/nickel.rb
|
159
|
+
- lib/treat/workers/extractors/time/ruby.rb
|
160
|
+
- lib/treat/workers/extractors/topic_words/lda.rb
|
161
|
+
- lib/treat/workers/extractors/topics/reuters.rb
|
162
|
+
- lib/treat/workers/formatters/readers/abw.rb
|
163
|
+
- lib/treat/workers/formatters/readers/autoselect.rb
|
164
|
+
- lib/treat/workers/formatters/readers/doc.rb
|
165
|
+
- lib/treat/workers/formatters/readers/html.rb
|
166
|
+
- lib/treat/workers/formatters/readers/image.rb
|
167
|
+
- lib/treat/workers/formatters/readers/odt.rb
|
168
|
+
- lib/treat/workers/formatters/readers/pdf.rb
|
169
|
+
- lib/treat/workers/formatters/readers/txt.rb
|
170
|
+
- lib/treat/workers/formatters/readers/xml.rb
|
171
|
+
- lib/treat/workers/formatters/serializers/mongo.rb
|
172
|
+
- lib/treat/workers/formatters/serializers/xml.rb
|
173
|
+
- lib/treat/workers/formatters/serializers/yaml.rb
|
174
|
+
- lib/treat/workers/formatters/unserializers/autoselect.rb
|
175
|
+
- lib/treat/workers/formatters/unserializers/mongo.rb
|
176
|
+
- lib/treat/workers/formatters/unserializers/xml.rb
|
177
|
+
- lib/treat/workers/formatters/unserializers/yaml.rb
|
178
|
+
- lib/treat/workers/formatters/visualizers/dot.rb
|
179
|
+
- lib/treat/workers/formatters/visualizers/standoff.rb
|
180
|
+
- lib/treat/workers/formatters/visualizers/tree.rb
|
181
|
+
- lib/treat/workers/group.rb
|
182
|
+
- lib/treat/workers/inflectors/cardinalizers/linguistics.rb
|
183
|
+
- lib/treat/workers/inflectors/conjugators/linguistics.rb
|
184
|
+
- lib/treat/workers/inflectors/declensors/active_support.rb
|
185
|
+
- lib/treat/workers/inflectors/declensors/english/inflect.rb
|
186
|
+
- lib/treat/workers/inflectors/declensors/english.rb
|
187
|
+
- lib/treat/workers/inflectors/declensors/linguistics.rb
|
188
|
+
- lib/treat/workers/inflectors/ordinalizers/linguistics.rb
|
189
|
+
- lib/treat/workers/inflectors/stemmers/porter.rb
|
190
|
+
- lib/treat/workers/inflectors/stemmers/porter_c.rb
|
191
|
+
- lib/treat/workers/inflectors/stemmers/uea.rb
|
192
|
+
- lib/treat/workers/learners/classifiers/id3.rb
|
193
|
+
- lib/treat/workers/learners/classifiers/mlp.rb
|
194
|
+
- lib/treat/workers/lexicalizers/categorizers/from_tag.rb
|
195
|
+
- lib/treat/workers/lexicalizers/sensers/wordnet/synset.rb
|
196
|
+
- lib/treat/workers/lexicalizers/sensers/wordnet.rb
|
197
|
+
- lib/treat/workers/lexicalizers/taggers/brill/patch.rb
|
198
|
+
- lib/treat/workers/lexicalizers/taggers/brill.rb
|
199
|
+
- lib/treat/workers/lexicalizers/taggers/lingua.rb
|
200
|
+
- lib/treat/workers/lexicalizers/taggers/stanford.rb
|
201
|
+
- lib/treat/workers/processors/chunkers/autoselect.rb
|
202
|
+
- lib/treat/workers/processors/chunkers/html.rb
|
203
|
+
- lib/treat/workers/processors/chunkers/txt.rb
|
204
|
+
- lib/treat/workers/processors/parsers/enju.rb
|
205
|
+
- lib/treat/workers/processors/parsers/stanford.rb
|
206
|
+
- lib/treat/workers/processors/segmenters/punkt.rb
|
207
|
+
- lib/treat/workers/processors/segmenters/stanford.rb
|
208
|
+
- lib/treat/workers/processors/segmenters/tactful.rb
|
209
|
+
- lib/treat/workers/processors/tokenizers/ptb.rb
|
210
|
+
- lib/treat/workers/processors/tokenizers/punkt.rb
|
211
|
+
- lib/treat/workers/processors/tokenizers/stanford.rb
|
212
|
+
- lib/treat/workers/processors/tokenizers/tactful.rb
|
213
|
+
- lib/treat/workers/retrievers/indexers/ferret.rb
|
214
|
+
- lib/treat/workers/retrievers/searchers/ferret.rb
|
215
|
+
- lib/treat/workers.rb
|
205
216
|
- lib/treat.rb
|
206
217
|
- spec/collection.rb
|
207
218
|
- spec/document.rb
|
208
219
|
- spec/entity.rb
|
209
|
-
- spec/
|
220
|
+
- spec/node.rb
|
210
221
|
- spec/phrase.rb
|
211
222
|
- spec/samples/mathematicians/archimedes.abw
|
212
223
|
- spec/samples/mathematicians/euler.html
|
@@ -216,21 +227,20 @@ files:
|
|
216
227
|
- spec/sandbox.rb
|
217
228
|
- spec/token.rb
|
218
229
|
- spec/treat.rb
|
219
|
-
- spec/tree.rb
|
220
230
|
- spec/word.rb
|
221
231
|
- spec/zone.rb
|
222
|
-
- tmp/
|
223
|
-
-
|
224
|
-
- files/
|
225
|
-
- files/
|
226
|
-
- files/
|
232
|
+
- tmp/english.yaml
|
233
|
+
- tmp/MANIFEST
|
234
|
+
- files/21552208.html
|
235
|
+
- files/3_2_release_notes.html
|
236
|
+
- files/MANIFEST
|
237
|
+
- files/nethttp-cheat-sheet-2940.html
|
238
|
+
- files/weather-central-canada-heat-wave.html
|
227
239
|
- README.md
|
228
240
|
- LICENSE
|
229
241
|
homepage: https://github.com/louismullie/treat
|
230
242
|
licenses: []
|
231
|
-
post_install_message:
|
232
|
-
you for installing Treat!\n\nComplete the installation by running:\n\n require
|
233
|
-
'treat'\n Treat.install\n\ninside IRB or a Ruby script.\n\n********************************************************************************\n\n"
|
243
|
+
post_install_message:
|
234
244
|
rdoc_options: []
|
235
245
|
require_paths:
|
236
246
|
- lib
|
@@ -248,7 +258,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
248
258
|
version: '0'
|
249
259
|
requirements: []
|
250
260
|
rubyforge_project:
|
251
|
-
rubygems_version: 1.8.
|
261
|
+
rubygems_version: 1.8.24
|
252
262
|
signing_key:
|
253
263
|
specification_version: 3
|
254
264
|
summary: Text Retrieval, Extraction and Annotation Toolkit.
|
data/lib/treat/ai.rb
DELETED
data/lib/treat/categories.rb
DELETED
@@ -1,90 +0,0 @@
|
|
1
|
-
# This module keeps track of all the Treat::Categorizable
|
2
|
-
# modules that exist and the methods they define.
|
3
|
-
#
|
4
|
-
#
|
5
|
-
# - Processors perform the building of tree of
|
6
|
-
# entities representing texts (chunking,
|
7
|
-
# segmenting, tokenizing, parsing).
|
8
|
-
# - Lexicalizers give lexical information about
|
9
|
-
# words (synsets, semantic relationships,
|
10
|
-
# tag, word category).
|
11
|
-
# - Extractors extract semantic information about
|
12
|
-
# an entity (language, topic, date, time, named
|
13
|
-
# entity, coreferences).
|
14
|
-
# - Inflectors allow to retrieve the different
|
15
|
-
# inflections of a word (declensors, conjugators,
|
16
|
-
# stemmers, lemmatizers).
|
17
|
-
# - Formatters handle the conversion of entities to
|
18
|
-
# and from different formats(readers, serializers,
|
19
|
-
# unserializers, visualizers).
|
20
|
-
# - Retrievers allow to index and search collections
|
21
|
-
# of documents.
|
22
|
-
module Treat::Categories
|
23
|
-
|
24
|
-
class << self
|
25
|
-
# A list of all categories.
|
26
|
-
attr_accessor :list
|
27
|
-
end
|
28
|
-
|
29
|
-
# Array - list of all categories.
|
30
|
-
self.list = []
|
31
|
-
# A lookup table for entity types.
|
32
|
-
@@lookup = {}
|
33
|
-
|
34
|
-
# Require all categories.
|
35
|
-
require 'treat/categorizable'
|
36
|
-
require 'treat/formatters'
|
37
|
-
require 'treat/processors'
|
38
|
-
require 'treat/lexicalizers'
|
39
|
-
require 'treat/inflectors'
|
40
|
-
require 'treat/extractors'
|
41
|
-
require 'treat/retrievers'
|
42
|
-
require 'treat/ai'
|
43
|
-
|
44
|
-
# Create the lookup table.
|
45
|
-
self.list.each do |category|
|
46
|
-
category.groups.each do |group|
|
47
|
-
group = category.const_get(group)
|
48
|
-
@@lookup[group.method] = group
|
49
|
-
group.presets.each do |x,y|
|
50
|
-
@@lookup[x] = group
|
51
|
-
end if group.presets
|
52
|
-
end
|
53
|
-
end
|
54
|
-
|
55
|
-
# Find the class of a group given its method.
|
56
|
-
def self.lookup(method)
|
57
|
-
@@lookup[method]
|
58
|
-
end
|
59
|
-
|
60
|
-
# Fix -- This must be moved urgently.
|
61
|
-
Treat::Entities::Entity.class_eval do
|
62
|
-
|
63
|
-
alias :true_language :language
|
64
|
-
|
65
|
-
def language(extractor = nil, options = {})
|
66
|
-
|
67
|
-
if is_a?(Treat::Entities::Symbol) ||
|
68
|
-
is_a?(Treat::Entities::Number)
|
69
|
-
return Treat.default_language
|
70
|
-
end
|
71
|
-
|
72
|
-
if !Treat.detect_language
|
73
|
-
return Treat.default_language
|
74
|
-
else
|
75
|
-
dlvl = Treat.language_detection_level
|
76
|
-
if (Treat::Entities.rank(type) <
|
77
|
-
Treat::Entities.rank(dlvl)) &&
|
78
|
-
has_parent?
|
79
|
-
anc = ancestor_with_type(dlvl)
|
80
|
-
return anc.language if anc
|
81
|
-
end
|
82
|
-
end
|
83
|
-
|
84
|
-
true_language(extractor, options)
|
85
|
-
|
86
|
-
end
|
87
|
-
|
88
|
-
end
|
89
|
-
|
90
|
-
end
|
data/lib/treat/categorizable.rb
DELETED
@@ -1,44 +0,0 @@
|
|
1
|
-
# A categorizable module brings together groups
# of algorithms that perform similar functions.
#
# A category module that extends Treat::Categorizable is
# registered in Treat::Categories.list, and each group found
# under it is added as workers on the entity classes that
# the group targets.
module Treat::Categorizable

  # The contents of each categorizable
  # module are groupable.
  require 'treat/groupable'

  # Add workers to the Entities based on the
  # configuration for a given category.
  #
  # Runs when +category+ extends this module. The groups are
  # read from the constants already defined under +category+
  # at that moment.
  def self.extended(category)
    Treat::Categories.list << category
    category.module_eval do
      groups.each do |group_name|
        group = const_get(group_name)
        group.targets.each do |entity_type|
          # cc is a helper defined elsewhere in Treat; it is
          # presumably a camel-case conversion -- TODO confirm.
          entity = Treat::Entities.const_get(cc(entity_type))
          entity.class_eval { add_workers group }
        end
      end
    end
  end

  # Provides a list of groups within this category, i.e.
  # the names of the constants defined under it.
  def groups
    constants
  end

  # Provide a list of methods implemented in
  # the groups contained within this category.
  #
  # The list is computed from the category's own groups on
  # every call. It used to be built once, in the body of
  # Treat::Categorizable itself, from that module's (empty)
  # constant list, so it was always [] for every category.
  #
  # NOTE(review): relies on each group answering a
  # zero-argument +method+ call, exactly as the previous
  # code did -- confirm against Treat::Groupable.
  def methods
    groups.map { |group_name| const_get(group_name).method }
  end

end
|
data/lib/treat/configurable.rb
DELETED
@@ -1,115 +0,0 @@
|
|
1
|
-
# This module provides configuration options for the Treat toolkit
# (enable/disable syntactic sugar, enable/disable language detection
# and set default language or language detection level).
#
# It is meant to be mixed in with +extend+; every method below is
# then available on the extending module (Treat).
module Treat::Configurable

  # Syntactic sugar is disabled by default. The flag lives on
  # Treat::Configurable and is shared by everything extending it.
  @@sweetened = false

  # Modify the singleton class of the base module (Treat):
  # define the configuration accessors on it, then assign
  # their default values.
  def self.extended(base)

    # Configuration options that are available for the Treat module.
    class << base
      # Symbol - default language to use when detect_language is false.
      attr_accessor :default_language
      # Boolean - detect language or use default?
      attr_accessor :detect_language
      # Symbol - the finest entity level at which to detect language.
      attr_accessor :language_detection_level
      # Boolean - whether to output debug information or not.
      attr_accessor :debug
      # Boolean - whether to silence the output of external programs.
      attr_accessor :silence
    end

    # Set the default options.
    base.module_eval do
      # Set the default language to english.
      self.default_language = :eng
      # Turn language detection off by default.
      self.detect_language = false
      # Detect the language once per document by default.
      self.language_detection_level = :document
      # Set debug to off by default.
      self.debug = false
      # Silence external programs by default.
      self.silence = true
    end

  end

  # Turn on syntactic sugar for the creation of Entities.
  #
  # All entities found under Treat::Entities will be made
  # available within the global namespace. As an example,
  # 'Treat::Entities::Word' can then be referred to as 'Word'.
  #
  # There is one exception: the Symbol class is not sweetened
  # to avoid clashing with the Symbol class defined by Ruby.
  #
  # Does nothing when sugar is already enabled.
  def sweeten!
    return if @@sweetened
    @@sweetened = true
    each_entity_class do |type, klass|
      next if type == :Symbol
      Object.class_eval do
        # Builder method named after the entity class.
        define_method(type) do |file_or_value, options = {}|
          klass.build(file_or_value, options)
        end
      end
    end
  end

  # Turn off syntactic sugar by removing the builder methods
  # that sweeten! defined on Object.
  #
  # Does nothing when sugar is not enabled.
  def unsweeten!
    return unless @@sweetened
    @@sweetened = false
    each_entity_class do |type, _klass|
      next if type == :Symbol
      Object.class_eval { remove_method(type) }
    end
  end

  # Boolean - whether syntactic sugar is
  # enabled or not.
  def sweetened?
    @@sweetened
  end

  # Turn on language detection, optionally setting
  # the language detection level (finest level at
  # which language detection is performed).
  #
  # Defined as a regular module method so that +extend+ makes it
  # available on the base. As a singleton method of
  # Treat::Configurable it was never copied to the base and
  # could not reach the accessors, which exist only there.
  def detect!(level = nil)
    self.detect_language = true
    self.language_detection_level = level if level
  end

  # Turn off language detection, optionally setting
  # a new default language to use. Passing nil or false
  # leaves the current default language untouched.
  #
  # NOTE(review): the default here is :english while the initial
  # default_language assigned in +extended+ is :eng -- confirm
  # which form the rest of Treat expects.
  def undetect!(default = :english)
    self.detect_language = false
    self.default_language = default if default
  end

  # Use the supplied language by default and
  # turn off language detection.
  def use(language)
    self.detect_language = false
    self.default_language = language
  end

  private

  # Helper method, yields each entity type and class.
  #
  # The type is the interned result of cc(entity_type) for every
  # entry of Treat::Entities.list; the class is the constant of
  # that name looked up directly on Treat::Entities.
  def each_entity_class
    Treat::Entities.list.each do |entity_type|
      type = cc(entity_type).intern
      # Non-inheriting lookup, spelled out. The previous code got
      # the same effect by accident, passing a still-nil local as
      # the inherit flag.
      klass = Treat::Entities.const_get(type, false)
      yield type, klass
    end
  end

end
|
data/lib/treat/dependencies.rb
DELETED
@@ -1,25 +0,0 @@
|
|
1
|
-
# Static lists of the external software referenced by Treat.
class Treat::Dependencies

  # Gem entries, each a [name, version constraint, description]
  # triple; the description states what the gem is used for.
  #
  # NOTE(review): inside this class the bare constant name Gem
  # resolves to this array, not to ::Gem.
  Gem = [
    [
      'psych', '>= 1.2.2',
      '(un)serialize annotated entities to YAML format'
    ],
    [
      'nokogiri', '>= 1.5.2',
      'read and parse XML and HTML formats'
    ],
    [
      'sdsykes-ferret', '>= 0.11.6.19',
      'perform full-text search in collections'
    ],
    [
      'lda-ruby', '>= 0.3.8',
      'extract topic words from documents and collections'
    ],
    [
      'ruby-readability', '>= 0.5.2',
      'extract the readable content from HTML pages'
    ],
    [
      'stanford-core-nlp', '>= 0.3.0',
      'tokenize, segment, parse texts and perform named entity recognition'
    ],
    [
      'whatlanguage', '>= 1.0.0',
      'detect the language of text'
    ],
    [
      'linguistics', '>= 1.0.9',
      'retrieve the inflection of nouns, verbs and numbers in English'
    ],
    [
      'punkt-segmenter', '>= 0.9.1',
      'segment texts into sentences'
    ],
    [
      'chronic', '>= 0.6.7',
      'detect date and time in text'
    ],
    [
      'decisiontree', '>= 0.3.0',
      'perform decision tree classification of text entities'
    ],
    [
      'ai4r', '>= 1.11',
      'perform different kinds of classification tasks on text entities'
    ]
  ]

  # Non-gem entries, each a [name, description] pair.
  # Presumably these are system packages or executables --
  # TODO confirm against the installer that reads this list.
  Binary = [
    [
      'ocropus',
      'recognize text in image files'
    ],
    [
      'antiword',
      'extract text from DOC files'
    ],
    [
      'poppler-utils',
      'extract text from PDF files'
    ],
    [
      'graphviz',
      'export and visualize directed graphs'
    ]
  ]

end
|