treat 0.2.5 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +3 -3
- data/README.md +33 -0
- data/files/INFO +1 -0
- data/lib/treat.rb +40 -105
- data/lib/treat/ai.rb +12 -0
- data/lib/treat/ai/classifiers/id3.rb +27 -0
- data/lib/treat/categories.rb +82 -35
- data/lib/treat/categorizable.rb +44 -0
- data/lib/treat/classification.rb +61 -0
- data/lib/treat/configurable.rb +115 -0
- data/lib/treat/data_set.rb +42 -0
- data/lib/treat/dependencies.rb +24 -0
- data/lib/treat/downloader.rb +87 -0
- data/lib/treat/entities.rb +68 -66
- data/lib/treat/entities/abilities.rb +10 -0
- data/lib/treat/entities/abilities/buildable.rb +327 -0
- data/lib/treat/entities/abilities/checkable.rb +31 -0
- data/lib/treat/entities/abilities/copyable.rb +45 -0
- data/lib/treat/entities/abilities/countable.rb +51 -0
- data/lib/treat/entities/abilities/debuggable.rb +83 -0
- data/lib/treat/entities/abilities/delegatable.rb +123 -0
- data/lib/treat/entities/abilities/doable.rb +62 -0
- data/lib/treat/entities/abilities/exportable.rb +11 -0
- data/lib/treat/entities/abilities/iterable.rb +115 -0
- data/lib/treat/entities/abilities/magical.rb +83 -0
- data/lib/treat/entities/abilities/registrable.rb +74 -0
- data/lib/treat/entities/abilities/stringable.rb +91 -0
- data/lib/treat/entities/entities.rb +104 -0
- data/lib/treat/entities/entity.rb +122 -245
- data/lib/treat/exception.rb +4 -4
- data/lib/treat/extractors.rb +77 -80
- data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
- data/lib/treat/extractors/language/what_language.rb +50 -45
- data/lib/treat/extractors/name_tag/stanford.rb +55 -0
- data/lib/treat/extractors/tf_idf/native.rb +87 -0
- data/lib/treat/extractors/time/chronic.rb +55 -0
- data/lib/treat/extractors/time/nickel.rb +86 -62
- data/lib/treat/extractors/time/ruby.rb +53 -0
- data/lib/treat/extractors/topic_words/lda.rb +67 -58
- data/lib/treat/extractors/topics/reuters.rb +100 -87
- data/lib/treat/formatters.rb +39 -35
- data/lib/treat/formatters/readers/abw.rb +49 -29
- data/lib/treat/formatters/readers/autoselect.rb +37 -33
- data/lib/treat/formatters/readers/doc.rb +19 -13
- data/lib/treat/formatters/readers/html.rb +52 -30
- data/lib/treat/formatters/readers/image.rb +41 -40
- data/lib/treat/formatters/readers/odt.rb +59 -45
- data/lib/treat/formatters/readers/pdf.rb +28 -25
- data/lib/treat/formatters/readers/txt.rb +12 -15
- data/lib/treat/formatters/readers/xml.rb +73 -36
- data/lib/treat/formatters/serializers/xml.rb +80 -79
- data/lib/treat/formatters/serializers/yaml.rb +19 -18
- data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
- data/lib/treat/formatters/unserializers/xml.rb +94 -99
- data/lib/treat/formatters/unserializers/yaml.rb +20 -19
- data/lib/treat/formatters/visualizers/dot.rb +132 -132
- data/lib/treat/formatters/visualizers/standoff.rb +52 -44
- data/lib/treat/formatters/visualizers/tree.rb +26 -29
- data/lib/treat/groupable.rb +153 -0
- data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
- data/lib/treat/inflectors.rb +50 -45
- data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
- data/lib/treat/inflectors/declensors/active_support.rb +31 -0
- data/lib/treat/inflectors/declensors/english.rb +38 -0
- data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
- data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
- data/lib/treat/inflectors/stemmers/porter.rb +160 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
- data/lib/treat/inflectors/stemmers/uea.rb +28 -0
- data/lib/treat/installer.rb +308 -0
- data/lib/treat/kernel.rb +105 -27
- data/lib/treat/languages.rb +122 -88
- data/lib/treat/languages/arabic.rb +15 -15
- data/lib/treat/languages/chinese.rb +15 -15
- data/lib/treat/languages/dutch.rb +15 -15
- data/lib/treat/languages/english.rb +61 -62
- data/lib/treat/languages/french.rb +19 -19
- data/lib/treat/languages/german.rb +20 -20
- data/lib/treat/languages/greek.rb +15 -15
- data/lib/treat/languages/italian.rb +16 -16
- data/lib/treat/languages/polish.rb +15 -15
- data/lib/treat/languages/portuguese.rb +15 -15
- data/lib/treat/languages/russian.rb +15 -15
- data/lib/treat/languages/spanish.rb +16 -16
- data/lib/treat/languages/swedish.rb +16 -16
- data/lib/treat/lexicalizers.rb +34 -55
- data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
- data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
- data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
- data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
- data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
- data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
- data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
- data/lib/treat/linguistics.rb +9 -0
- data/lib/treat/linguistics/categories.rb +11 -0
- data/lib/treat/linguistics/tags.rb +422 -0
- data/lib/treat/loaders/linguistics.rb +30 -0
- data/lib/treat/loaders/stanford.rb +27 -0
- data/lib/treat/object.rb +1 -0
- data/lib/treat/processors.rb +37 -44
- data/lib/treat/processors/chunkers/autoselect.rb +16 -0
- data/lib/treat/processors/chunkers/html.rb +71 -0
- data/lib/treat/processors/chunkers/txt.rb +18 -24
- data/lib/treat/processors/parsers/enju.rb +253 -208
- data/lib/treat/processors/parsers/stanford.rb +130 -131
- data/lib/treat/processors/segmenters/punkt.rb +79 -45
- data/lib/treat/processors/segmenters/stanford.rb +46 -48
- data/lib/treat/processors/segmenters/tactful.rb +43 -36
- data/lib/treat/processors/tokenizers/perl.rb +124 -92
- data/lib/treat/processors/tokenizers/ptb.rb +81 -0
- data/lib/treat/processors/tokenizers/punkt.rb +48 -42
- data/lib/treat/processors/tokenizers/stanford.rb +39 -38
- data/lib/treat/processors/tokenizers/tactful.rb +64 -55
- data/lib/treat/proxies.rb +52 -35
- data/lib/treat/retrievers.rb +26 -16
- data/lib/treat/retrievers/indexers/ferret.rb +47 -26
- data/lib/treat/retrievers/searchers/ferret.rb +69 -50
- data/lib/treat/tree.rb +241 -183
- data/spec/collection.rb +123 -0
- data/spec/document.rb +93 -0
- data/spec/entity.rb +408 -0
- data/spec/languages.rb +25 -0
- data/spec/phrase.rb +146 -0
- data/spec/samples/mathematicians/archimedes.abw +34 -0
- data/spec/samples/mathematicians/euler.html +21 -0
- data/spec/samples/mathematicians/gauss.pdf +0 -0
- data/spec/samples/mathematicians/leibniz.txt +13 -0
- data/spec/samples/mathematicians/newton.doc +0 -0
- data/spec/sandbox.rb +5 -0
- data/spec/token.rb +109 -0
- data/spec/treat.rb +52 -0
- data/spec/tree.rb +117 -0
- data/spec/word.rb +110 -0
- data/spec/zone.rb +66 -0
- data/tmp/INFO +1 -1
- metadata +100 -201
- data/INSTALL +0 -1
- data/README +0 -3
- data/TODO +0 -28
- data/lib/economist/half_cocked_basel.txt +0 -16
- data/lib/economist/hungarys_troubles.txt +0 -46
- data/lib/economist/indias_slowdown.txt +0 -15
- data/lib/economist/merkozy_rides_again.txt +0 -24
- data/lib/economist/prada_is_not_walmart.txt +0 -9
- data/lib/economist/to_infinity_and_beyond.txt +0 -15
- data/lib/ferret/_11.cfs +0 -0
- data/lib/ferret/_14.cfs +0 -0
- data/lib/ferret/_p.cfs +0 -0
- data/lib/ferret/_s.cfs +0 -0
- data/lib/ferret/_v.cfs +0 -0
- data/lib/ferret/_y.cfs +0 -0
- data/lib/ferret/segments +0 -0
- data/lib/ferret/segments_15 +0 -0
- data/lib/treat/buildable.rb +0 -157
- data/lib/treat/category.rb +0 -33
- data/lib/treat/delegatable.rb +0 -116
- data/lib/treat/doable.rb +0 -45
- data/lib/treat/entities/collection.rb +0 -14
- data/lib/treat/entities/document.rb +0 -12
- data/lib/treat/entities/phrases.rb +0 -17
- data/lib/treat/entities/tokens.rb +0 -61
- data/lib/treat/entities/zones.rb +0 -41
- data/lib/treat/extractors/coreferences/stanford.rb +0 -69
- data/lib/treat/extractors/date/chronic.rb +0 -32
- data/lib/treat/extractors/date/ruby.rb +0 -25
- data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
- data/lib/treat/extractors/language/language_extractor.rb +0 -27
- data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
- data/lib/treat/extractors/roles/naive.rb +0 -73
- data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
- data/lib/treat/extractors/statistics/position_in.rb +0 -14
- data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
- data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
- data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
- data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
- data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
- data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
- data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
- data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
- data/lib/treat/feature.rb +0 -58
- data/lib/treat/features.rb +0 -7
- data/lib/treat/formatters/visualizers/short_value.rb +0 -29
- data/lib/treat/formatters/visualizers/txt.rb +0 -45
- data/lib/treat/group.rb +0 -106
- data/lib/treat/helpers/linguistics_loader.rb +0 -18
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
- data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
- data/lib/treat/inflectors/declensions/english.rb +0 -319
- data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
- data/lib/treat/inflectors/stem/porter.rb +0 -162
- data/lib/treat/inflectors/stem/porter_c.rb +0 -26
- data/lib/treat/inflectors/stem/uea.rb +0 -30
- data/lib/treat/install.rb +0 -59
- data/lib/treat/languages/tags.rb +0 -377
- data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
- data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
- data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
- data/lib/treat/lexicalizers/tag/brill.rb +0 -91
- data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
- data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
- data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
- data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
- data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
- data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
- data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
- data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
- data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
- data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
- data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
- data/lib/treat/registrable.rb +0 -28
- data/lib/treat/sugar.rb +0 -50
- data/lib/treat/viewable.rb +0 -29
- data/lib/treat/visitable.rb +0 -28
- data/test/profile.rb +0 -2
- data/test/tc_entity.rb +0 -117
- data/test/tc_extractors.rb +0 -73
- data/test/tc_formatters.rb +0 -41
- data/test/tc_inflectors.rb +0 -34
- data/test/tc_lexicalizers.rb +0 -32
- data/test/tc_processors.rb +0 -50
- data/test/tc_resources.rb +0 -22
- data/test/tc_treat.rb +0 -60
- data/test/tc_tree.rb +0 -60
- data/test/tests.rb +0 -20
- data/test/texts.rb +0 -19
- data/test/texts/english/half_cocked_basel.txt +0 -16
- data/test/texts/english/hose_and_dry.doc +0 -0
- data/test/texts/english/hungarys_troubles.abw +0 -70
- data/test/texts/english/long.html +0 -24
- data/test/texts/english/long.txt +0 -22
- data/test/texts/english/medium.txt +0 -5
- data/test/texts/english/republican_nomination.pdf +0 -0
- data/test/texts/english/saving_the_euro.odt +0 -0
- data/test/texts/english/short.txt +0 -3
- data/test/texts/english/zero_sum.html +0 -111
data/lib/treat/exception.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module Treat
|
2
2
|
# Custom exception class for the Treat toolkit.
|
3
3
|
# Used to distinguish between errors raised by
|
4
|
-
# gems
|
5
|
-
class Exception < ::Exception
|
6
|
-
end
|
7
|
-
end
|
4
|
+
# gems/Ruby from errors raised by the toolkit.
|
5
|
+
class Exception < ::Exception; end
|
6
|
+
class InvalidInputException < Exception; end
|
7
|
+
end
|
data/lib/treat/extractors.rb
CHANGED
@@ -1,82 +1,79 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
self.default = :what_language
|
11
|
-
end
|
12
|
-
# Extracts the time of an object and annotates it
|
13
|
-
# with specific information regarding time.
|
14
|
-
module Time
|
15
|
-
extend Group
|
16
|
-
self.type = :annotator
|
17
|
-
self.targets = [:phrase]
|
18
|
-
end
|
19
|
-
# Extracts the time of an object and annotates it
|
20
|
-
# with specific information regarding time.
|
21
|
-
module Date
|
22
|
-
extend Group
|
23
|
-
self.type = :annotator
|
24
|
-
self.targets = [:phrase]
|
25
|
-
end
|
26
|
-
# Extract the topic from a text.
|
27
|
-
module Topics
|
28
|
-
extend Group
|
29
|
-
self.type = :annotator
|
30
|
-
self.targets = [:document, :zone]
|
31
|
-
end
|
32
|
-
# Extract the keywords from a text.
|
33
|
-
module Keywords
|
34
|
-
extend Group
|
35
|
-
self.type = :annotator
|
36
|
-
self.targets = [:document, :zone]
|
37
|
-
end
|
38
|
-
# Extract the topic words from a text.
|
39
|
-
module TopicWords
|
40
|
-
extend Group
|
41
|
-
self.type = :annotator
|
42
|
-
self.targets = [:collection]
|
43
|
-
end
|
44
|
-
# Extract named entities from texts.
|
45
|
-
module NamedEntityTag
|
46
|
-
extend Group
|
47
|
-
self.type = :annotator
|
48
|
-
self.targets = [:phrase, :word]
|
49
|
-
end
|
50
|
-
# Extract named entities from texts.
|
51
|
-
module Coreferences
|
52
|
-
extend Group
|
53
|
-
self.type = :annotator
|
54
|
-
self.targets = [:zone]
|
55
|
-
end
|
56
|
-
# This module should be moved out of here ASAP.
|
57
|
-
module Statistics
|
58
|
-
extend Group
|
59
|
-
self.type = :annotator
|
60
|
-
self.targets = [:word]
|
61
|
-
self.default = :none
|
62
|
-
self.preprocessors = {
|
63
|
-
:frequency_in => lambda do |entity, worker, options|
|
64
|
-
options = {:parent => worker}.merge(options)
|
65
|
-
entity.statistics(:frequency_in, options)
|
66
|
-
end,
|
67
|
-
:tf_idf => lambda do |entity, worker, options|
|
68
|
-
entity.statistics(:tf_idf, options)
|
69
|
-
end,
|
70
|
-
:position_in => lambda do |entity, options|
|
71
|
-
entity.statistics(:position_in, options)
|
72
|
-
end
|
73
|
-
}
|
74
|
-
end
|
75
|
-
module Roles
|
76
|
-
extend Group
|
77
|
-
self.type = :annotator
|
78
|
-
self.targets = [:phrase]
|
79
|
-
end
|
80
|
-
extend Treat::Category
|
1
|
+
# Extractors extract information out of texts.
|
2
|
+
module Treat::Extractors
|
3
|
+
|
4
|
+
# Extracts the language from an entity.
|
5
|
+
module Language
|
6
|
+
extend Treat::Groupable
|
7
|
+
self.type = :annotator
|
8
|
+
self.targets = [:entity]
|
9
|
+
self.default = :what_language
|
81
10
|
end
|
11
|
+
|
12
|
+
# Extracts the date/time of a phrase.
|
13
|
+
module Time
|
14
|
+
extend Treat::Groupable
|
15
|
+
self.type = :annotator
|
16
|
+
self.targets = [:phrase]
|
17
|
+
end
|
18
|
+
|
19
|
+
# Extract the topic from a document or zone.
|
20
|
+
module Topics
|
21
|
+
extend Treat::Groupable
|
22
|
+
self.type = :annotator
|
23
|
+
self.targets = [:document]
|
24
|
+
end
|
25
|
+
|
26
|
+
# Extract the keywords from a text.
|
27
|
+
module Keywords
|
28
|
+
extend Treat::Groupable
|
29
|
+
self.type = :annotator
|
30
|
+
self.targets = [:document]
|
31
|
+
end
|
32
|
+
|
33
|
+
# Extract clusters of topic words from a collection.
|
34
|
+
module TopicWords
|
35
|
+
extend Treat::Groupable
|
36
|
+
self.type = :annotator
|
37
|
+
self.targets = [:collection]
|
38
|
+
end
|
39
|
+
|
40
|
+
# Extract named entities from phrases.
|
41
|
+
module NameTag
|
42
|
+
extend Treat::Groupable
|
43
|
+
self.type = :annotator
|
44
|
+
self.targets = [:phrase, :word]
|
45
|
+
end
|
46
|
+
|
47
|
+
# Extract coreferences from a zone.
|
48
|
+
module Coreferences
|
49
|
+
extend Treat::Groupable
|
50
|
+
self.type = :annotator
|
51
|
+
self.targets = [:zone]
|
52
|
+
end
|
53
|
+
|
54
|
+
# Retrieve the main grammatical roles
|
55
|
+
# in the phrase (subject, verb, object).
|
56
|
+
module Roles
|
57
|
+
extend Treat::Groupable
|
58
|
+
self.type = :annotator
|
59
|
+
self.targets = [:phrase]
|
60
|
+
end
|
61
|
+
|
62
|
+
module TfIdf
|
63
|
+
extend Treat::Groupable
|
64
|
+
self.type = :annotator
|
65
|
+
self.targets = [:word]
|
66
|
+
self.default = :native
|
67
|
+
end
|
68
|
+
|
69
|
+
module Summary
|
70
|
+
extend Treat::Groupable
|
71
|
+
self.type = :annotator
|
72
|
+
self.targets = [:document]
|
73
|
+
self.default = :keyword_count
|
74
|
+
end
|
75
|
+
|
76
|
+
# Make Extractors categorizable.
|
77
|
+
extend Treat::Categorizable
|
78
|
+
|
82
79
|
end
|
@@ -1,26 +1,60 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
1
|
+
# This retrieves a supplied number of keywords
|
2
|
+
# by selecting the N words with the highest TF*IDF
|
3
|
+
# for each document.
|
4
|
+
class Treat::Extractors::Keywords::TfIdf
|
5
|
+
|
6
|
+
# Default options - retrieve 5 keywords.
|
7
|
+
DefaultOptions = { :number => 5 }
|
8
|
+
|
9
|
+
# Annotate a document with an array containing
|
10
|
+
# the N words with the highest TF*IDF in that
|
11
|
+
# document,
|
12
|
+
def self.keywords(entity, options = {})
|
13
|
+
|
14
|
+
options = DefaultOptions.merge(options)
|
15
|
+
tf_idfs = {}
|
16
|
+
entity.each_word do |word|
|
17
|
+
word.check_has(:tf_idf, false)
|
18
|
+
tf_idfs[word] ||= word.get(:tf_idf)
|
19
|
+
end
|
20
|
+
|
21
|
+
tf_idfs = tf_idfs.
|
22
|
+
sort_by {|k,v| v}.reverse
|
23
|
+
|
24
|
+
if tf_idfs.size <= options[:number]
|
25
|
+
return tf_idfs
|
26
|
+
end
|
27
|
+
|
28
|
+
keywords = []
|
29
|
+
i = 0
|
30
|
+
|
31
|
+
tf_idfs.each do |word|
|
32
|
+
|
33
|
+
w = word[0].to_s
|
34
|
+
next if keywords.include?(w)
|
35
|
+
|
36
|
+
entity.each_word_with_value(w) do |w2|
|
37
|
+
|
38
|
+
ps = w2.parent_phrase
|
39
|
+
|
40
|
+
if ps.has?(:keyword_count)
|
41
|
+
ps.set :keyword_count,
|
42
|
+
ps.keyword_count + 1
|
43
|
+
else
|
44
|
+
ps.set :keyword_count, 1
|
22
45
|
end
|
46
|
+
ps.set :keyword_density,
|
47
|
+
(ps.keyword_count / ps.size)
|
48
|
+
|
23
49
|
end
|
50
|
+
|
51
|
+
break if i > options[:number]
|
52
|
+
keywords << w
|
53
|
+
|
54
|
+
i += 1
|
24
55
|
end
|
56
|
+
|
57
|
+
keywords
|
25
58
|
end
|
26
|
-
|
59
|
+
|
60
|
+
end
|
@@ -1,49 +1,54 @@
|
|
1
|
-
module Treat
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
1
|
+
module Treat::Extractors::Language
|
2
|
+
|
3
|
+
# Adaptor for the 'whatlanguage' gem, which
|
4
|
+
# performs probabilistic language detection.
|
5
|
+
# The library works by checking for the presence
|
6
|
+
# of words with bloom filters built from
|
7
|
+
# dictionaries based upon each source language.
|
8
|
+
class WhatLanguage
|
9
|
+
|
10
|
+
# Require the 'whatlanguage' gem.
|
11
|
+
silence_warnings { require 'whatlanguage' }
|
12
|
+
|
13
|
+
# Undefine the method defined by the gem.
|
14
|
+
String.class_eval { undef :language }
|
15
|
+
|
16
|
+
# By default, bias towards common languages.
|
17
|
+
DefaultOptions = {
|
18
|
+
:bias => [:eng, :fre, :chi, :ger, :ara, :spa]
|
19
|
+
}
|
20
|
+
|
21
|
+
# Keep only once instance of the gem class.
|
22
|
+
@@detector = nil
|
23
|
+
|
24
|
+
# Detect the language of an entity using the
|
25
|
+
# 'whatlanguage' gem. Return an identifier
|
26
|
+
# corresponding to the ISO-639-2 code for the
|
27
|
+
# language.
|
28
|
+
#
|
29
|
+
# Options:
|
30
|
+
#
|
31
|
+
# - (Array of Symbols) bias => Languages to bias
|
32
|
+
# toward when more than one language is detected
|
33
|
+
# with equal probability.
|
34
|
+
def self.language(entity, options = {})
|
35
|
+
options = DefaultOptions.merge(options)
|
36
|
+
@@detector ||= ::WhatLanguage.new(:possibilities)
|
37
|
+
possibilities = @@detector.process_text(entity.to_s)
|
38
|
+
lang = {}
|
39
|
+
possibilities.each do |k,v|
|
40
|
+
lang[Treat::Languages.code(k)] = v
|
41
|
+
end
|
42
|
+
max = lang.values.max
|
43
|
+
ordered = lang.select { |i,j| j == max }.keys
|
44
|
+
ordered.each do |l|
|
45
|
+
if options[:bias].include?(l)
|
46
|
+
return l
|
45
47
|
end
|
46
48
|
end
|
49
|
+
return ordered.first
|
47
50
|
end
|
51
|
+
|
48
52
|
end
|
49
|
-
|
53
|
+
|
54
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# Detects the named entity tag in sentences by using
|
2
|
+
# the stanford-core-nlp gem, which interfaces with
|
3
|
+
# the Stanford Deterministic Coreference Resolver.
|
4
|
+
class Treat::Extractors::NameTag::Stanford
|
5
|
+
|
6
|
+
require 'treat/loaders/stanford'
|
7
|
+
|
8
|
+
@@classifiers = {}
|
9
|
+
|
10
|
+
def self.name_tag(entity, options = {})
|
11
|
+
|
12
|
+
pp = nil
|
13
|
+
|
14
|
+
lang = entity.language
|
15
|
+
|
16
|
+
language = Treat::Languages.describe(lang)
|
17
|
+
|
18
|
+
isolated_token = entity.is_a?(Treat::Entities::Token)
|
19
|
+
tokens = isolated_token ? [entity] : entity.tokens
|
20
|
+
|
21
|
+
ms = StanfordCoreNLP::Config::Models[:ner][language]
|
22
|
+
ms = Treat.models + 'stanford/' +
|
23
|
+
StanfordCoreNLP::Config::ModelFolders[:ner] +
|
24
|
+
ms['3class']
|
25
|
+
|
26
|
+
@@classifiers[lang] ||=
|
27
|
+
StanfordCoreNLP::CRFClassifier.
|
28
|
+
getClassifier(ms)
|
29
|
+
|
30
|
+
token_list = StanfordCoreNLP.get_list(tokens)
|
31
|
+
sentence = @@classifiers[lang].classify_sentence(token_list)
|
32
|
+
|
33
|
+
i = 0
|
34
|
+
n = 0
|
35
|
+
|
36
|
+
sentence.each do |s_token|
|
37
|
+
tag = s_token.get(:answer).to_s.downcase
|
38
|
+
tag = nil if tag == 'o'
|
39
|
+
return tag if isolated_token
|
40
|
+
if tag
|
41
|
+
tokens[i].set :name_tag, tag
|
42
|
+
n += 1
|
43
|
+
end
|
44
|
+
i += 1
|
45
|
+
end
|
46
|
+
|
47
|
+
entity.set :named_entity_count, n
|
48
|
+
|
49
|
+
nil
|
50
|
+
|
51
|
+
end
|
52
|
+
|
53
|
+
|
54
|
+
|
55
|
+
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
# Calculates the TF*IDF score of words.
|
2
|
+
module Treat::Extractors::TfIdf::Native
|
3
|
+
DefaultOptions = {
|
4
|
+
:tf => :natural,
|
5
|
+
:idf => :logarithm,
|
6
|
+
:remove_common_words => true,
|
7
|
+
:precision => 4
|
8
|
+
}
|
9
|
+
Algorithms = {
|
10
|
+
:tf => {
|
11
|
+
:natural => lambda { |tf| tf },
|
12
|
+
:logarithm => lambda { |tf| Math.log(1 + tf) },
|
13
|
+
:sqrt =>lambda { |tf| Math.sqrt(tf) }
|
14
|
+
},
|
15
|
+
:idf => {
|
16
|
+
:logarithm => lambda { |n,df| Math.log(n/(1 + df)) },
|
17
|
+
:none => lambda { |n,idf| 1 }
|
18
|
+
}
|
19
|
+
}
|
20
|
+
# Optimization caches for tf idf.
|
21
|
+
@@n = {} # Number of documents in the collection (n).
|
22
|
+
@@df= {} # Number of documents that have a given value (document count).
|
23
|
+
@@f = {} # Number of times a word appears in a given document (term count).
|
24
|
+
@@wc = {} # Number of words in a given document (word count).
|
25
|
+
@@cw = {} # Common words to filter out.
|
26
|
+
def self.tf_idf(entity, options={})
|
27
|
+
l = Treat::Languages.get(entity.language)
|
28
|
+
if l.const_defined?(:CommonWords)
|
29
|
+
@@cw[entity.language] =
|
30
|
+
l.const_get(:CommonWords)
|
31
|
+
return 0 if @@cw[entity.language].include?(entity.value)
|
32
|
+
end
|
33
|
+
return 0 if entity.value.length <= 2
|
34
|
+
options = DefaultOptions.merge(options)
|
35
|
+
lambdas = options.partition do |k,v|
|
36
|
+
[:tf, :idf, :normalization].include?(k)
|
37
|
+
end[0]
|
38
|
+
lambdas.each do |opt,val|
|
39
|
+
if opt.is_a?(Symbol)
|
40
|
+
if Algorithms[opt][val]
|
41
|
+
options[opt] = Algorithms[opt][val]
|
42
|
+
else
|
43
|
+
raise Treat::Exception,
|
44
|
+
"The specified algorithm '#{val}' "+
|
45
|
+
"to calculate #{opt} does not exist."
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
49
|
+
collection = entity.parent_collection
|
50
|
+
unless collection
|
51
|
+
raise Treat::Exception, "Cannot get the TF*IDF scores " +
|
52
|
+
"for a document that is not in a collection."
|
53
|
+
end
|
54
|
+
document = entity.parent_document
|
55
|
+
dc = collection.document_count
|
56
|
+
if !collection || !document
|
57
|
+
raise Treat::Exception,
|
58
|
+
"Tf*Idf requires a collection with documents."
|
59
|
+
end
|
60
|
+
val = entity.value.downcase
|
61
|
+
@@n[collection.id] = dc if @@n[collection.id].nil?
|
62
|
+
@@df[collection.id] ||= {}
|
63
|
+
if @@df[collection.id][val].nil?
|
64
|
+
df = 0
|
65
|
+
collection.each_document do |doc|
|
66
|
+
@@f[doc.id] ||= {}
|
67
|
+
if @@f[doc.id][val].nil?
|
68
|
+
@@f[doc.id][val] =
|
69
|
+
doc.frequency_of(val)
|
70
|
+
end
|
71
|
+
df += 1 if @@f[doc.id][val] > 0
|
72
|
+
end
|
73
|
+
@@df[collection.id][val] = df
|
74
|
+
end
|
75
|
+
f = @@f[document.id][entity.value].to_f
|
76
|
+
df = @@df[collection.id][entity.value].to_f
|
77
|
+
tf = options[:tf].call(f).to_f
|
78
|
+
if options[:normalize_word_count]
|
79
|
+
@@wc[document.id] ||= document.word_count
|
80
|
+
tf /= @@wc[document.id]
|
81
|
+
end
|
82
|
+
n = @@n[collection.id].to_f
|
83
|
+
idf = options[:idf].call(n, df)
|
84
|
+
tf_idf = tf * idf
|
85
|
+
tf_idf.abs.round(options[:precision])
|
86
|
+
end
|
87
|
+
end
|