treat 1.2.0 → 2.0.0rc1
- data/LICENSE +2 -2
- data/README.md +12 -21
- data/lib/treat/autoload.rb +44 -0
- data/lib/treat/config/config.rb +38 -0
- data/lib/treat/config/configurable.rb +51 -0
- data/lib/treat/config/data/config.rb +50 -0
- data/lib/treat/config/data/core.rb +52 -0
- data/lib/treat/config/data/databases.rb +10 -0
- data/lib/treat/config/data/entities.rb +15 -0
- data/lib/treat/config/data/languages/agnostic.rb +31 -0
- data/lib/treat/config/{languages → data/languages}/arabic.rb +0 -0
- data/lib/treat/config/{languages → data/languages}/chinese.rb +0 -0
- data/lib/treat/config/{languages → data/languages}/dutch.rb +1 -1
- data/lib/treat/config/data/languages/english.rb +95 -0
- data/lib/treat/config/data/languages/french.rb +148 -0
- data/lib/treat/config/data/languages/german.rb +135 -0
- data/lib/treat/config/{languages → data/languages}/greek.rb +1 -1
- data/lib/treat/config/data/languages/italian.rb +162 -0
- data/lib/treat/config/data/languages/polish.rb +11 -0
- data/lib/treat/config/{languages → data/languages}/portuguese.rb +1 -1
- data/lib/treat/config/{languages → data/languages}/russian.rb +1 -1
- data/lib/treat/config/data/languages/spanish.rb +291 -0
- data/lib/treat/config/data/languages/swedish.rb +289 -0
- data/lib/treat/config/data/libraries.rb +12 -0
- data/lib/treat/config/data/linguistics.rb +44 -0
- data/lib/treat/config/data/tags.rb +328 -0
- data/lib/treat/config/{workers → data/workers}/extractors.rb +2 -10
- data/lib/treat/config/{workers → data/workers}/formatters.rb +0 -0
- data/lib/treat/config/{workers → data/workers}/inflectors.rb +0 -0
- data/lib/treat/config/{workers → data/workers}/learners.rb +0 -0
- data/lib/treat/config/{workers → data/workers}/lexicalizers.rb +4 -3
- data/lib/treat/config/{workers → data/workers}/processors.rb +3 -3
- data/lib/treat/config/{workers → data/workers}/retrievers.rb +0 -0
- data/lib/treat/config/importable.rb +31 -0
- data/lib/treat/config/paths.rb +23 -0
- data/lib/treat/config/tags.rb +37 -0
- data/lib/treat/core/dsl.rb +55 -0
- data/lib/treat/{installer.rb → core/installer.rb} +10 -12
- data/lib/treat/core/server.rb +40 -0
- data/lib/treat/entities/entities.rb +101 -0
- data/lib/treat/entities/{abilities/doable.rb → entity/applicable.rb} +5 -3
- data/lib/treat/entities/{abilities → entity}/buildable.rb +118 -63
- data/lib/treat/entities/{abilities → entity}/checkable.rb +2 -2
- data/lib/treat/entities/{abilities → entity}/comparable.rb +6 -6
- data/lib/treat/entities/{abilities → entity}/countable.rb +2 -1
- data/lib/treat/entities/entity/debuggable.rb +86 -0
- data/lib/treat/entities/{abilities → entity}/delegatable.rb +16 -26
- data/lib/treat/entities/{abilities → entity}/exportable.rb +2 -2
- data/lib/treat/entities/{abilities → entity}/iterable.rb +4 -16
- data/lib/treat/entities/{abilities → entity}/magical.rb +22 -17
- data/lib/treat/entities/entity/registrable.rb +36 -0
- data/lib/treat/entities/{abilities → entity}/stringable.rb +18 -15
- data/lib/treat/entities/entity.rb +86 -77
- data/lib/treat/exception.rb +3 -0
- data/lib/treat/helpers/hash.rb +29 -0
- data/lib/treat/helpers/help.rb +35 -0
- data/lib/treat/helpers/object.rb +55 -0
- data/lib/treat/helpers/string.rb +124 -0
- data/lib/treat/{core → learning}/data_set.rb +11 -11
- data/lib/treat/{core → learning}/export.rb +3 -3
- data/lib/treat/{core → learning}/problem.rb +26 -16
- data/lib/treat/{core → learning}/question.rb +5 -9
- data/lib/treat/loaders/linguistics.rb +8 -9
- data/lib/treat/loaders/stanford.rb +5 -11
- data/lib/treat/modules.rb +33 -0
- data/lib/treat/proxies/array.rb +27 -0
- data/lib/treat/proxies/language.rb +47 -0
- data/lib/treat/proxies/number.rb +18 -0
- data/lib/treat/proxies/proxy.rb +25 -0
- data/lib/treat/proxies/string.rb +18 -0
- data/lib/treat/version.rb +10 -1
- data/lib/treat/{workers.rb → workers/categorizable.rb} +18 -19
- data/lib/treat/workers/extractors/keywords/tf_idf.rb +11 -11
- data/lib/treat/workers/extractors/language/what_language.rb +8 -6
- data/lib/treat/workers/extractors/name_tag/stanford.rb +10 -4
- data/lib/treat/workers/extractors/similarity/levenshtein.rb +36 -0
- data/lib/treat/workers/extractors/similarity/tf_idf.rb +27 -0
- data/lib/treat/workers/extractors/tf_idf/native.rb +4 -4
- data/lib/treat/workers/extractors/time/chronic.rb +2 -4
- data/lib/treat/workers/extractors/time/nickel.rb +19 -20
- data/lib/treat/workers/extractors/time/ruby.rb +2 -1
- data/lib/treat/workers/extractors/topic_words/lda.rb +12 -12
- data/lib/treat/workers/extractors/topics/reuters.rb +9 -13
- data/lib/treat/workers/formatters/readers/autoselect.rb +1 -1
- data/lib/treat/workers/formatters/readers/image.rb +19 -9
- data/lib/treat/workers/formatters/readers/odt.rb +2 -1
- data/lib/treat/workers/formatters/readers/pdf.rb +20 -3
- data/lib/treat/workers/formatters/readers/xml.rb +0 -1
- data/lib/treat/workers/formatters/serializers/mongo.rb +10 -20
- data/lib/treat/workers/formatters/serializers/xml.rb +17 -26
- data/lib/treat/workers/formatters/serializers/yaml.rb +5 -4
- data/lib/treat/workers/formatters/unserializers/mongo.rb +4 -4
- data/lib/treat/workers/formatters/unserializers/xml.rb +3 -4
- data/lib/treat/workers/formatters/unserializers/yaml.rb +3 -4
- data/lib/treat/workers/formatters/visualizers/dot.rb +1 -0
- data/lib/treat/workers/formatters/visualizers/standoff.rb +2 -3
- data/lib/treat/workers/formatters/visualizers/tree.rb +2 -3
- data/lib/treat/workers/{group.rb → groupable.rb} +9 -9
- data/lib/treat/workers/inflectors/cardinalizers/linguistics.rb +1 -3
- data/lib/treat/workers/inflectors/conjugators/linguistics.rb +5 -7
- data/lib/treat/workers/inflectors/declensors/english.rb +13 -20
- data/lib/treat/workers/inflectors/declensors/linguistics.rb +29 -28
- data/lib/treat/workers/inflectors/ordinalizers/linguistics.rb +0 -2
- data/lib/treat/workers/inflectors/stemmers/porter.rb +8 -10
- data/lib/treat/workers/inflectors/stemmers/porter_c.rb +7 -7
- data/lib/treat/workers/inflectors/stemmers/uea.rb +3 -8
- data/lib/treat/workers/learners/classifiers/id3.rb +17 -14
- data/lib/treat/workers/learners/classifiers/linear.rb +15 -27
- data/lib/treat/workers/learners/classifiers/mlp.rb +32 -19
- data/lib/treat/workers/learners/classifiers/svm.rb +28 -21
- data/lib/treat/workers/lexicalizers/categorizers/from_tag.rb +19 -3
- data/lib/treat/workers/lexicalizers/sensers/wordnet.rb +15 -7
- data/lib/treat/workers/lexicalizers/taggers/brill/patch.rb +4 -1
- data/lib/treat/workers/lexicalizers/taggers/brill.rb +8 -19
- data/lib/treat/workers/lexicalizers/taggers/lingua.rb +4 -15
- data/lib/treat/workers/lexicalizers/taggers/stanford.rb +22 -13
- data/lib/treat/workers/processors/chunkers/autoselect.rb +2 -3
- data/lib/treat/workers/processors/chunkers/html.rb +1 -6
- data/lib/treat/workers/processors/parsers/enju.rb +2 -4
- data/lib/treat/workers/processors/parsers/stanford.rb +13 -7
- data/lib/treat/workers/processors/segmenters/punkt.rb +25 -11
- data/lib/treat/workers/processors/segmenters/scalpel.rb +20 -0
- data/lib/treat/workers/processors/segmenters/srx.rb +42 -0
- data/lib/treat/workers/processors/segmenters/stanford.rb +5 -5
- data/lib/treat/workers/processors/segmenters/tactful.rb +21 -11
- data/lib/treat/workers/processors/tokenizers/ptb.rb +40 -30
- data/lib/treat/workers/processors/tokenizers/punkt.rb +14 -19
- data/lib/treat/workers/processors/tokenizers/stanford.rb +38 -22
- data/lib/treat/workers/retrievers/indexers/ferret.rb +6 -3
- data/lib/treat/workers/retrievers/searchers/ferret.rb +2 -2
- data/lib/treat/workers/workers.rb +6 -0
- data/lib/treat.rb +18 -32
- data/models/MANIFEST +1 -0
- data/spec/core/data_set.rb +174 -0
- data/spec/core/export.rb +52 -0
- data/spec/core/problem.rb +144 -0
- data/spec/core/question.rb +52 -0
- data/spec/{collection.rb → entities/collection.rb} +20 -35
- data/spec/{document.rb → entities/document.rb} +3 -54
- data/spec/{entity.rb → entities/entity.rb} +10 -9
- data/spec/entities/phrase.rb +33 -0
- data/spec/{token.rb → entities/token.rb} +0 -57
- data/spec/entities/word.rb +3 -0
- data/spec/{zone.rb → entities/zone.rb} +0 -26
- data/spec/helper.rb +116 -32
- data/spec/sandbox.rb +258 -25
- data/spec/treat.rb +26 -34
- data/spec/workers/agnostic.rb +137 -0
- data/spec/workers/english.rb +194 -0
- data/spec/workers/examples/english/economist/hungarys_troubles.txt +46 -0
- data/spec/workers/examples/english/economist/saving_the_euro.odt +0 -0
- data/spec/{samples → workers/examples/english}/mathematicians/archimedes.abw +0 -0
- data/spec/{samples → workers/examples/english}/mathematicians/euler.html +0 -0
- data/spec/{samples → workers/examples/english}/mathematicians/gauss.pdf +0 -0
- data/spec/{samples → workers/examples/english}/mathematicians/leibniz.txt +0 -0
- data/spec/{samples → workers/examples/english}/mathematicians/newton.doc +0 -0
- data/spec/workers/examples/english/phrase.xml +5 -0
- data/spec/workers/examples/english/test.txt +1 -0
- data/spec/workers/language.rb +280 -0
- data/spec/workers.rb +28 -0
- metadata +122 -105
- data/lib/treat/config/core/acronyms.rb +0 -5
- data/lib/treat/config/core/encodings.rb +0 -8
- data/lib/treat/config/core/entities.rb +0 -2
- data/lib/treat/config/core/language.rb +0 -3
- data/lib/treat/config/core/paths.rb +0 -8
- data/lib/treat/config/core/syntax.rb +0 -1
- data/lib/treat/config/core/verbosity.rb +0 -1
- data/lib/treat/config/databases/default.rb +0 -1
- data/lib/treat/config/databases/mongo.rb +0 -1
- data/lib/treat/config/languages/agnostic.rb +0 -34
- data/lib/treat/config/languages/english.rb +0 -60
- data/lib/treat/config/languages/french.rb +0 -18
- data/lib/treat/config/languages/german.rb +0 -18
- data/lib/treat/config/languages/italian.rb +0 -12
- data/lib/treat/config/languages/polish.rb +0 -12
- data/lib/treat/config/languages/spanish.rb +0 -12
- data/lib/treat/config/languages/swedish.rb +0 -12
- data/lib/treat/config/libraries/punkt.rb +0 -1
- data/lib/treat/config/libraries/reuters.rb +0 -1
- data/lib/treat/config/libraries/stanford.rb +0 -1
- data/lib/treat/config/linguistics/categories.rb +0 -4
- data/lib/treat/config/linguistics/punctuation.rb +0 -33
- data/lib/treat/config/tags/aligned.rb +0 -221
- data/lib/treat/config/tags/enju.rb +0 -71
- data/lib/treat/config/tags/paris7.rb +0 -17
- data/lib/treat/config/tags/ptb.rb +0 -15
- data/lib/treat/config/workers/list.rb +0 -1
- data/lib/treat/config.rb +0 -135
- data/lib/treat/core.rb +0 -5
- data/lib/treat/entities/abilities/copyable.rb +0 -47
- data/lib/treat/entities/abilities/debuggable.rb +0 -83
- data/lib/treat/entities/abilities/registrable.rb +0 -46
- data/lib/treat/entities/collection.rb +0 -40
- data/lib/treat/entities/document.rb +0 -10
- data/lib/treat/entities/group.rb +0 -18
- data/lib/treat/entities/section.rb +0 -13
- data/lib/treat/entities/token.rb +0 -47
- data/lib/treat/entities/zone.rb +0 -12
- data/lib/treat/entities.rb +0 -6
- data/lib/treat/helpers/didyoumean.rb +0 -57
- data/lib/treat/helpers/escaping.rb +0 -15
- data/lib/treat/helpers/formatting.rb +0 -41
- data/lib/treat/helpers/objtohash.rb +0 -8
- data/lib/treat/helpers/platform.rb +0 -15
- data/lib/treat/helpers/reflection.rb +0 -17
- data/lib/treat/helpers/temporary.rb +0 -27
- data/lib/treat/helpers/verbosity.rb +0 -19
- data/lib/treat/helpers.rb +0 -5
- data/lib/treat/loaders.rb +0 -10
- data/lib/treat/proxies.rb +0 -106
- data/lib/treat/workers/formatters/unserializers/autoselect.rb +0 -17
- data/lib/treat/workers/inflectors/declensors/active_support.rb +0 -31
- data/lib/treat/workers/processors/tokenizers/tactful.rb +0 -68
- data/spec/core.rb +0 -441
- data/spec/phrase.rb +0 -112
- data/spec/word.rb +0 -111
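The listing above reflects the main restructuring in 2.0.0rc1: configuration data moves under lib/treat/config/data, the entity "abilities" move under lib/treat/entities/entity, the machine-learning classes move from treat/core to treat/learning, and a builder DSL is added in lib/treat/core/dsl.rb. As a quick orientation, here is a minimal sketch of that DSL assembled from the sandbox and spec diffs shown below (the sample text and paths are illustrative only):

  # encoding: utf-8
  require 'treat'
  include Treat::Core::DSL    # new in 2.0.0rc1 (lib/treat/core/dsl.rb)

  # Lowercase builders create entities directly.
  s = sentence 'Those who dream by day know of at least ' +
               '19 things that escape those who dream only at night.'

  # Workers are applied by name over the entity tree.
  s.apply :tokenize, :tag, :category

  # Inspect the resulting structure.
  s.print_tree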
data/spec/sandbox.rb
CHANGED
@@ -1,36 +1,269 @@
-
+# encoding: utf-8
+require_relative '../lib/treat'
+
+require 'treat'
+include Treat::Core::DSL
+
+Treat.libraries.stanford.jar_path = '/ruby/treat/bin/'
+Treat.libraries.stanford.model_path = '/ruby/treat/models/'
+
+p = paragraph
+s = sentence
+w = word
+
+p = phrase 'hello world'
+e = email 'louis@gmail.com'
+
+#d = document Treat.paths.spec + 'workers/examples/english/economist/hungarys_troubles.txt'
+#d.apply :chunk, :segment, :tokenize, :tag, :category, :name_tag
+#d.print_tree
+#d = document Treat.paths.spec + 'workers/examples/english/economist/saving_the_euro.odt'
+#d.print_tree
+=begin
+d = document 'test.htm'
+d.apply :chunk
+#d.serialize :yaml, file: 'test444.yaml'
+d.set :test, 2
+d.serialize :mongo, db: 'test'
+d.set :test, 3
+d.serialize :mongo, db: 'test'
+d.apply :segment, :tokenize, :tag, :category
+puts d.verb_count
+#d2 = document id: d.id, db: 'test'
+d2 = document 'features.test' => 3, db: 'test'
+d2.apply :segment, :tokenize, :tag, :category
+puts d2.verb_count
+#d.print_tree
+#s = document 'http://www.economist.com'
+
+p = phrase 'hello', 'world', '!'
+puts p.to_s
+puts p.to_str
+=end
+
+=begin
+### Super basics.
+puts p.value
+
+p << 'bitch'
+p << word('hello')
+puts p.to_s
+puts p.to_str
+puts p.value
+puts p.to_ary.inspect
+=end
+
+=begin
+
+### Configuration
+
+# A boolean value indicating whether to silence the output of external libraries (e.g. Stanford tools, Enju, LDA, Ruby-FANN) when they are used.
+puts Treat.core.verbosity.silence
+# A boolean value indicating whether to explain the steps that Treat is performing.
+puts Treat.core.verbosity.debug
+# A boolean value indicating whether Treat should try to detect the language of newly input text.
+puts Treat.core.language.detect
+# The language to default to when detection is off.
+puts Treat.core.language.default
+# A symbol representing the finest level at which language detection should be performed if language detection is turned on.
+puts Treat.core.language.detect_at
+
+# A directory in which to create temporary files.
+puts Treat.paths.tmp
+# A directory in which to store downloaded files.
+puts Treat.paths.files
+# A directory containing trained models for various tasks.
+puts Treat.paths.models
+# A directory containing the spec files.
+puts Treat.paths.spec
+# A directory containing executables and JAR files.
+puts Treat.paths.bin
+puts Treat.paths.lib
+
+# Set up Mongoid.
+Treat.databases.mongo.db = 'your_database'
+Treat.databases.mongo.host = 'localhost'
+Treat.databases.mongo.port = '27017'
+
+# Transparent string casting.
+s = 'inflection'.stem
+# is equivalent to
+s = 'inflection'.to_entity.stem
+# which comes down to
+s = word('inflection').stem
+
+# Transparent number casting.
+n = 2.ordinal
+# is equivalent to
+s = 2.to_entity.ordinal
+# which comes down to
+s = number(2).ordinal
+=end
 =begin
-
-Question(:is_key_sentence, :sentence, :discrete, 0, [0, 1]),
-Feature(:word_count, 0),
-Tag(:number_count, 0)
-)
+### BASIC USAGE
 
-
+# Create a sentence
+s = sentence 'Those who dream by day know of at least ' +
+'19 things that escape those who dream only at night.'
 
-
+# Tokenize and tag it.
+s.tokenize.tag
 
-
+# View the sentence structure.
+s.print_tree
 
-
-
-
+# Iterate over the tokens.
+s.each_token do |tok|
+puts tok.value
+puts tok.type
+end
 
-ds = test.export(problem)
 
-
-
+
+# Arrays instead of iterators.
+(s.nouns + s.adjectives).each do |word|
+puts word.synonyms
+puts word.antonyms
+end
+
+# Functions on numbers.
+s.each_number do |num|
+puts num.ordinal
+puts num.cardinal
 end
+
+# See all the annotations.
+s.each do |tok|
+puts tok.inspect
+end
+
+# Lazy way of doing all of the above.
+s = sentence 'Those who dream by day know of at least ' +
+'19 things that escape those who dream only at night.'
+
+s.apply :tokenize, :tag, :category,
+:stem, :hyponyms, :hypernyms,
+:antonyms, :ordinal, :cardinal
+
 =end
+
 =begin
-
-
-
-
-
-
+### A BIT MORE ADVANCED USAGE
+
+section = section "Obama-Sarkozy Meeting\n" +
+"Obama and Sarkozy met on January 1st to investigate " +
+"the possibility of a new rescue plan. President " +
+"Sarkozy is to meet Merkel next Tuesday in Berlin."
+
+# Chunk: split the titles and paragraphs.
+# Segment: perform sentence segmentation.
+# Parse: parse the syntax of each sentence.
+section.apply :chunk, :segment, :parse
+
+# View the tree structure.
+section.print_tree
 
-
-
-
-
+# Get some basic info on the text.
+puts section.title
+puts section.sentence_count
+puts section.word_count
+
+section.apply :category
+puts section.noun_count
+puts section.frequency_of 'president'
+
+section.each_phrase_with_tag('NP') do |phrase|
+puts phrase.to_s
+end
+
+=end
+=begin
+### URL documents, XML serialization.
+
+urls = ['http://www.cbc.ca/news/world/story/2012/11/25/snc-lavalin-ben-aissa-charges.html',
+'http://www.cbc.ca/news/world/story/2012/11/25/egypt.html', 'http://www.cbc.ca/news/canada/prince-edward-island/story/2012/11/25/pei-murder-arrest-stlucia.html', 'http://www.cbc.ca/news/world/story/2012/11/25/bangladesh-garment-factory-fire.html']
+
+c = collection
+urls.each { |url| c << document(url) }
+
+# View the collection.
+c.print_tree
+
+c.apply :chunk, :segment, :tokenize
+c.serialize :xml, :file => 'test.xml'
+
+# Reopen the collection.
+c = collection 'test.xml'
+
+# View it again.
+c.print_tree
+=end
+=begin
+include Treat::Core::DSL
+
+# Show progress bars for download.
+Treat.core.verbosity.silence = false
+# Explain what Treat is doing.
+Treat.core.verbosity.debug = true
+
+# Define the question "is it junk?" on sentences.
+qn = question(:is_junk, :sentence)
+
+# Frame the problem as depending on punctuation
+# count and word count for each sentence.
+pb = problem(qn,
+feature(:punctuation_count),
+feature(:word_count) )
+
+# Get some web documents to work on.
+url1 = 'http://en.wikipedia.org/wiki/NOD_mouse'
+url2 = 'http://en.wikipedia.org/wiki/Academic_studies_about_Wikipedia'
+d1, d2 = document(url1), document(url2)
+
+# Process both of our documents.
+[d1,d2].apply(:chunk, :segment, :tokenize)
+
+# Answer our problem to create a training set.
+d1.sentences[0..17].each { |s| s.set :is_junk, 0 }
+d1.sentences[17..-1].each { |s| s.set :is_junk, 1 }
+d_set = d1.export(pb)
+
+# Define our gold standard results for evaluation.
+d2.sentences[0..81].each { |s| s.set :is_true_junk, 0 }
+d2.sentences[81..-1].each { |s| s.set :is_true_junk, 1 }
+
+tp, fp, tn, fn = 0.0, 0.0, 0.0, 0.0
+
+d2.sentences.map do |s|
+pred = s.classify(:id3, training: d_set)
+if pred == 1
+tp += 1 if s.is_true_junk == 1
+fp += 1 if s.is_true_junk == 0
+else
+tn += 1 if s.is_true_junk == 0
+fn += 1 if s.is_true_junk == 1
+end
+end
+
+puts "Precision: #{tp/(tp + fp)}"
+puts "Recall: #{tp/(tp + fn)}"
+=end
+=begin
+d = document 'http://louismullie.com/susan-text-scan1.jpg'
+d.apply :chunk, :segment, :tokenize
+d.print_tree
+=end
+=begin
+# Syntax example
+phra = phrase 'Obama', 'Sarkozy', 'Meeting'
+
+para = paragraph 'Obama and Sarkozy met on January 1st to'
+'investigate the possibility of a new rescue plan. Nicolas ' +
+'Sarkozy is to meet Merkel next Tuesday in Berlin.'
+
+sect = section title(phra), para
+=end
+=begin
+puts "beer".plural.inspect
+=end
+p = paragraph
data/spec/treat.rb
CHANGED
@@ -2,44 +2,36 @@ require_relative 'helper'
 
 describe Treat do
 
-describe "Syntactic sugar:"
-
-
-
-
-
-
-
-
-
-
-
-
-
-Treat.core.syntax.sweetened.should eql true
-
-Object.method_defined?(
-:"#{type.to_s.capitalize}").
-should eql true
-
-Treat::Config.unsweeten!
-Treat.core.syntax.sweetened.should eql false
-
-Object.method_defined?(
-type.to_s.capitalize.intern).should eql false
-
-Object.method_defined?(
-:"#{type.to_s.capitalize}").
-should eql false
+describe "Syntactic sugar:" do
+
+describe "#sweeten!, #unsweeten!" do
+it "respectively turn on and off syntactic sugar and " +
+"define/undefine entity builders as uppercase methods " +
+"in the global namespace" do
+Treat.core.entities.list.each do |type|
+next if type == :symbol
+
+Treat::Config.sweeten!
+Treat.core.syntax.sweetened.should eql true
+Object.method_defined?(
+:"#{type.to_s.capitalize}").
+should eql true
 
+Treat::Config.unsweeten!
+Treat.core.syntax.sweetened.should eql false
+Object.method_defined?(
+type.to_s.capitalize.intern).should eql false
+Object.method_defined?(
+:"#{type.to_s.capitalize}").
+should eql false
+end
 end
-
 end
-
+
 end
-
+
 describe "Paths:" do
-
+
 paths = Treat.core.paths.description
 # Check IO for bin, files, tmp, models. Fix.
 paths.each_pair do |path, files|
@@ -49,7 +41,7 @@ describe Treat do
 end
 end
 end
-
+
 end
 
 end
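The spec above exercises Treat::Config.sweeten! and #unsweeten!. A minimal sketch of the behaviour it asserts, assuming the uppercase builders accept the same arguments as the lowercase DSL builders (the sample string is illustrative):

  require 'treat'

  Treat::Config.sweeten!              # defines Word(), Sentence(), ... on Object
  Treat.core.syntax.sweetened         # => true
  s = Sentence('An example built with sweetened syntax.')

  Treat::Config.unsweeten!            # removes the uppercase builders again
  Object.method_defined?(:Sentence)   # => false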
data/spec/workers/agnostic.rb
ADDED
@@ -0,0 +1,137 @@
+class Treat::Specs::Workers::Agnostic < Treat::Specs::Workers::Language
+
+# TODO: :tf_idf, :keywords, :classifiers
+# :read,. :unserialize
+
+Scenarios = {
+
+# Also tests unserialize.
+serialize: {
+entity: {
+examples: [
+["A test entity.", "A test entity."]
+],
+generator: lambda { |selector| Treat::Entities::Entity.build(selector).to_s }
+}
+},
+classify: {
+entity: {
+examples: [
+["Homer", 1, lambda { {training: Treat::Learning::DataSet.build('test.marshal')} }]
+],
+preprocessor: lambda do |entity|
+ds = Treat::Learning::DataSet.new(
+Treat::Learning::Problem.new(
+Treat::Learning::Question.new(:is_person, :word, :false, :discrete),
+Treat::Learning::Feature.new(:first_capital, 0, "->(e) { (e.to_s[0] =~ /^[A-Z]$/) ? 1 : 0 }"),
+Treat::Learning::Tag.new(:value, 0)
+))
+w1, w2, w3, w4, w5 =
+["Alfred", "lucky", "Hobbit", "hello", "Alice"].
+map { |w| Treat::Entities::Word.new(w) }
+w1.set :is_person, 1
+w2.set :is_person, 0
+w3.set :is_person, 1
+w4.set :is_person, 0
+w5.set :is_person, 1
+ds << w1; ds << w2; ds << w3
+ds.serialize :marshal, file: 'test.marshal'
+end
+}
+},
+visualize: {
+entity: {
+examples: {
+standoff: [
+["I walked to the store.", "(S\n (PRP I) (VBD walked) (TO to) (DT the) (NN store) (. .))\n"]
+],
+tree: [
+["I walked to the store.", "+ Sentence (*) --- \"I walked to the store.\" --- {} --- [] \n|\n+--> Word (*) --- \"I\" --- {} --- [] \n+--> Word (*) --- \"walked\" --- {} --- [] \n+--> Word (*) --- \"to\" --- {} --- [] \n+--> Word (*) --- \"the\" --- {} --- [] \n+--> Word (*) --- \"store\" --- {} --- [] \n+--> Punctuation (*) --- \".\" --- {} --- [] "]
+],
+dot: [
+["I walked to the store.", "graph {\n* [label=\"Sentence\\n\\\"I walked to the store.\\\"\",color=\"\"]\n* [label=\"Word\\n\\\"I\\\"\",color=\"\"]\n* -- *;\n* [label=\"Word\\n\\\"walked\\\"\",color=\"\"]\n* -- *;\n* [label=\"Word\\n\\\"to\\\"\",color=\"\"]\n* -- *;\n* [label=\"Word\\n\\\"the\\\"\",color=\"\"]\n* -- *;\n* [label=\"Word\\n\\\"store\\\"\",color=\"\"]\n* -- *;\n* [label=\"Punctuation\\n\\\".\\\"\",color=\"\"]\n* -- *;\n}"]
+]
+},
+preprocessor: lambda { |entity| entity.tokenize },
+generator: lambda { |result| result.gsub(/[0-9]+/, '*') }
+}
+},
+
+=begin
+keywords: {
+document: {
+examples: [
+["./spec/workers/examples/english/economist/saving_the_euro.odt",
+["crisis", "government", "called", "financial", "funds", "treaty"]]
+],
+preprocessor: lambda do |document|
+coll = Treat::Entities::Collection.build('./spec/workers/examples/english/economist/')
+coll << document
+coll.apply(:chunk, :segment, :tokenize, :keywords)
+document
+end
+},
+section: {
+examples: [
+["A test phrase", ["A", "test", "phrase"]]
+]
+},
+zone: {
+examples: [
+["A test phrase", ["A", "test", "phrase"]]
+]
+}
+},
+=end
+=begin
+unserialize: {
+examples: [
+["A test entity.", "A test entity."]
+],
+generator: lambda { |selector| Treat::Entities::Entity.build(selector).to_s }
+},
+=end
+=begin
+# Index
+search: {
+collection: {
+examples: [
+["./spec/workers/examples/english/economist/",
+"Hungary's troubles", {query: 'Hungary'}]
+],
+generator: lambda { |docs| docs[0].titles[0] },
+preprocessor: lambda { |coll| coll.apply(:index) }
+},
+},
+=end
+=begin
+keywords: {
+document: {
+examples: [
+["./spec/languages/english/economist/saving_the_euro.odt",
+["A", "test", "phrase"]]
+],
+preprocessor: lambda { |doc| doc.parent = Collection('./spec/languages/english/economist/') }
+},
+section: {
+examples: [
+["A test phrase", ["A", "test", "phrase"]]
+]
+},
+zone: {
+examples: [
+["A test phrase", ["A", "test", "phrase"]]
+]
+}
+},
+=end
+topic_words: {
+collection: {
+examples: [
+["./spec/workers/examples/english/economist", [["orban", "minister", "bajnai", "mr", "government", "president", "law", "brussels", "commission", "hu"], ["government", "minister", "fidesz", "mr", "hvg", "today", "hungarian", "bajnai", "national", "office"], ["mr", "today", "central", "minister", "crisis", "prime", "president", "bank", "european", "government"], ["sarkozy", "mr", "greece", "german", "summit", "france", "merkel", "opera", "growth", "euro"], ["central", "hand", "minister", "week", "bank", "forced", "hungarian", "parliament", "political", "hvg"], ["minister", "crisis", "central", "bank", "hand", "law", "forced", "bajnai", "parliament", "president"], ["mr", "bank", "european", "central", "government", "called", "today", "financial", "policies", "press"], ["mr", "crisis", "government", "central", "today", "funds", "president", "issues", "bank", "called"], ["mr", "crisis", "minister", "today", "european", "prime", "financial", "president", "issues", "treaty"], ["central", "minister", "mr", "bajnai", "orban", "bank", "parliament", "week", "fidesz", "washington"], ["mr", "central", "government", "crisis", "minister", "orban", "hand", "fidesz", "bajnai", "judicial"], ["mr", "sarkozy", "chancellor", "government", "european", "merkozy", "role", "mrs", "interest", "quickly"], ["mr", "orban", "government", "crisis", "hungarian", "independence", "prime", "today", "hand", "bajnai"], ["euro", "fiscal", "merkel", "mrs", "sarkozy", "mr", "european", "zone", "leaders", "chancellor"], ["mr", "bank", "crisis", "financial", "president", "funds", "government", "treaty", "central", "part"], ["mr", "central", "minister", "crisis", "prime", "european", "government", "bank", "treaty", "issues"], ["sarkozy", "fiscal", "merkel", "mrs", "growth", "zone", "german", "role", "paper", "quickly"], ["mr", "government", "orban", "bank", "bajnai", "hungarian", "prime", "-", "hu", "commission"], ["mr", "orban", "today", "bank", "minister", "national", "government", "-", "crisis", "forced"], ["role", "summit", "merkel", "euro", "zone", "german", "mr", "greece", "sarkozy", "step"]]]
+]
+}
+}
+}
+
+end
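The Scenarios hash above is consumed by the language worker specs; as a rough usage sketch, the :visualize entries correspond to calls along these lines (this assumes an entity-level #visualize worker method, which the scenario key implies but which is not shown in this diff):

  require 'treat'
  include Treat::Core::DSL

  s = sentence 'I walked to the store.'
  s.tokenize                        # the scenario's preprocessor step
  puts s.visualize(:tree)           # compared against the :tree example string
  puts s.visualize(:standoff)       # "(S\n (PRP I) (VBD walked) ..."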