treat 0.2.5 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +3 -3
- data/README.md +33 -0
- data/files/INFO +1 -0
- data/lib/treat.rb +40 -105
- data/lib/treat/ai.rb +12 -0
- data/lib/treat/ai/classifiers/id3.rb +27 -0
- data/lib/treat/categories.rb +82 -35
- data/lib/treat/categorizable.rb +44 -0
- data/lib/treat/classification.rb +61 -0
- data/lib/treat/configurable.rb +115 -0
- data/lib/treat/data_set.rb +42 -0
- data/lib/treat/dependencies.rb +24 -0
- data/lib/treat/downloader.rb +87 -0
- data/lib/treat/entities.rb +68 -66
- data/lib/treat/entities/abilities.rb +10 -0
- data/lib/treat/entities/abilities/buildable.rb +327 -0
- data/lib/treat/entities/abilities/checkable.rb +31 -0
- data/lib/treat/entities/abilities/copyable.rb +45 -0
- data/lib/treat/entities/abilities/countable.rb +51 -0
- data/lib/treat/entities/abilities/debuggable.rb +83 -0
- data/lib/treat/entities/abilities/delegatable.rb +123 -0
- data/lib/treat/entities/abilities/doable.rb +62 -0
- data/lib/treat/entities/abilities/exportable.rb +11 -0
- data/lib/treat/entities/abilities/iterable.rb +115 -0
- data/lib/treat/entities/abilities/magical.rb +83 -0
- data/lib/treat/entities/abilities/registrable.rb +74 -0
- data/lib/treat/entities/abilities/stringable.rb +91 -0
- data/lib/treat/entities/entities.rb +104 -0
- data/lib/treat/entities/entity.rb +122 -245
- data/lib/treat/exception.rb +4 -4
- data/lib/treat/extractors.rb +77 -80
- data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
- data/lib/treat/extractors/language/what_language.rb +50 -45
- data/lib/treat/extractors/name_tag/stanford.rb +55 -0
- data/lib/treat/extractors/tf_idf/native.rb +87 -0
- data/lib/treat/extractors/time/chronic.rb +55 -0
- data/lib/treat/extractors/time/nickel.rb +86 -62
- data/lib/treat/extractors/time/ruby.rb +53 -0
- data/lib/treat/extractors/topic_words/lda.rb +67 -58
- data/lib/treat/extractors/topics/reuters.rb +100 -87
- data/lib/treat/formatters.rb +39 -35
- data/lib/treat/formatters/readers/abw.rb +49 -29
- data/lib/treat/formatters/readers/autoselect.rb +37 -33
- data/lib/treat/formatters/readers/doc.rb +19 -13
- data/lib/treat/formatters/readers/html.rb +52 -30
- data/lib/treat/formatters/readers/image.rb +41 -40
- data/lib/treat/formatters/readers/odt.rb +59 -45
- data/lib/treat/formatters/readers/pdf.rb +28 -25
- data/lib/treat/formatters/readers/txt.rb +12 -15
- data/lib/treat/formatters/readers/xml.rb +73 -36
- data/lib/treat/formatters/serializers/xml.rb +80 -79
- data/lib/treat/formatters/serializers/yaml.rb +19 -18
- data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
- data/lib/treat/formatters/unserializers/xml.rb +94 -99
- data/lib/treat/formatters/unserializers/yaml.rb +20 -19
- data/lib/treat/formatters/visualizers/dot.rb +132 -132
- data/lib/treat/formatters/visualizers/standoff.rb +52 -44
- data/lib/treat/formatters/visualizers/tree.rb +26 -29
- data/lib/treat/groupable.rb +153 -0
- data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
- data/lib/treat/inflectors.rb +50 -45
- data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
- data/lib/treat/inflectors/declensors/active_support.rb +31 -0
- data/lib/treat/inflectors/declensors/english.rb +38 -0
- data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
- data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
- data/lib/treat/inflectors/stemmers/porter.rb +160 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
- data/lib/treat/inflectors/stemmers/uea.rb +28 -0
- data/lib/treat/installer.rb +308 -0
- data/lib/treat/kernel.rb +105 -27
- data/lib/treat/languages.rb +122 -88
- data/lib/treat/languages/arabic.rb +15 -15
- data/lib/treat/languages/chinese.rb +15 -15
- data/lib/treat/languages/dutch.rb +15 -15
- data/lib/treat/languages/english.rb +61 -62
- data/lib/treat/languages/french.rb +19 -19
- data/lib/treat/languages/german.rb +20 -20
- data/lib/treat/languages/greek.rb +15 -15
- data/lib/treat/languages/italian.rb +16 -16
- data/lib/treat/languages/polish.rb +15 -15
- data/lib/treat/languages/portuguese.rb +15 -15
- data/lib/treat/languages/russian.rb +15 -15
- data/lib/treat/languages/spanish.rb +16 -16
- data/lib/treat/languages/swedish.rb +16 -16
- data/lib/treat/lexicalizers.rb +34 -55
- data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
- data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
- data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
- data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
- data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
- data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
- data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
- data/lib/treat/linguistics.rb +9 -0
- data/lib/treat/linguistics/categories.rb +11 -0
- data/lib/treat/linguistics/tags.rb +422 -0
- data/lib/treat/loaders/linguistics.rb +30 -0
- data/lib/treat/loaders/stanford.rb +27 -0
- data/lib/treat/object.rb +1 -0
- data/lib/treat/processors.rb +37 -44
- data/lib/treat/processors/chunkers/autoselect.rb +16 -0
- data/lib/treat/processors/chunkers/html.rb +71 -0
- data/lib/treat/processors/chunkers/txt.rb +18 -24
- data/lib/treat/processors/parsers/enju.rb +253 -208
- data/lib/treat/processors/parsers/stanford.rb +130 -131
- data/lib/treat/processors/segmenters/punkt.rb +79 -45
- data/lib/treat/processors/segmenters/stanford.rb +46 -48
- data/lib/treat/processors/segmenters/tactful.rb +43 -36
- data/lib/treat/processors/tokenizers/perl.rb +124 -92
- data/lib/treat/processors/tokenizers/ptb.rb +81 -0
- data/lib/treat/processors/tokenizers/punkt.rb +48 -42
- data/lib/treat/processors/tokenizers/stanford.rb +39 -38
- data/lib/treat/processors/tokenizers/tactful.rb +64 -55
- data/lib/treat/proxies.rb +52 -35
- data/lib/treat/retrievers.rb +26 -16
- data/lib/treat/retrievers/indexers/ferret.rb +47 -26
- data/lib/treat/retrievers/searchers/ferret.rb +69 -50
- data/lib/treat/tree.rb +241 -183
- data/spec/collection.rb +123 -0
- data/spec/document.rb +93 -0
- data/spec/entity.rb +408 -0
- data/spec/languages.rb +25 -0
- data/spec/phrase.rb +146 -0
- data/spec/samples/mathematicians/archimedes.abw +34 -0
- data/spec/samples/mathematicians/euler.html +21 -0
- data/spec/samples/mathematicians/gauss.pdf +0 -0
- data/spec/samples/mathematicians/leibniz.txt +13 -0
- data/spec/samples/mathematicians/newton.doc +0 -0
- data/spec/sandbox.rb +5 -0
- data/spec/token.rb +109 -0
- data/spec/treat.rb +52 -0
- data/spec/tree.rb +117 -0
- data/spec/word.rb +110 -0
- data/spec/zone.rb +66 -0
- data/tmp/INFO +1 -1
- metadata +100 -201
- data/INSTALL +0 -1
- data/README +0 -3
- data/TODO +0 -28
- data/lib/economist/half_cocked_basel.txt +0 -16
- data/lib/economist/hungarys_troubles.txt +0 -46
- data/lib/economist/indias_slowdown.txt +0 -15
- data/lib/economist/merkozy_rides_again.txt +0 -24
- data/lib/economist/prada_is_not_walmart.txt +0 -9
- data/lib/economist/to_infinity_and_beyond.txt +0 -15
- data/lib/ferret/_11.cfs +0 -0
- data/lib/ferret/_14.cfs +0 -0
- data/lib/ferret/_p.cfs +0 -0
- data/lib/ferret/_s.cfs +0 -0
- data/lib/ferret/_v.cfs +0 -0
- data/lib/ferret/_y.cfs +0 -0
- data/lib/ferret/segments +0 -0
- data/lib/ferret/segments_15 +0 -0
- data/lib/treat/buildable.rb +0 -157
- data/lib/treat/category.rb +0 -33
- data/lib/treat/delegatable.rb +0 -116
- data/lib/treat/doable.rb +0 -45
- data/lib/treat/entities/collection.rb +0 -14
- data/lib/treat/entities/document.rb +0 -12
- data/lib/treat/entities/phrases.rb +0 -17
- data/lib/treat/entities/tokens.rb +0 -61
- data/lib/treat/entities/zones.rb +0 -41
- data/lib/treat/extractors/coreferences/stanford.rb +0 -69
- data/lib/treat/extractors/date/chronic.rb +0 -32
- data/lib/treat/extractors/date/ruby.rb +0 -25
- data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
- data/lib/treat/extractors/language/language_extractor.rb +0 -27
- data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
- data/lib/treat/extractors/roles/naive.rb +0 -73
- data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
- data/lib/treat/extractors/statistics/position_in.rb +0 -14
- data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
- data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
- data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
- data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
- data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
- data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
- data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
- data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
- data/lib/treat/feature.rb +0 -58
- data/lib/treat/features.rb +0 -7
- data/lib/treat/formatters/visualizers/short_value.rb +0 -29
- data/lib/treat/formatters/visualizers/txt.rb +0 -45
- data/lib/treat/group.rb +0 -106
- data/lib/treat/helpers/linguistics_loader.rb +0 -18
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
- data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
- data/lib/treat/inflectors/declensions/english.rb +0 -319
- data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
- data/lib/treat/inflectors/stem/porter.rb +0 -162
- data/lib/treat/inflectors/stem/porter_c.rb +0 -26
- data/lib/treat/inflectors/stem/uea.rb +0 -30
- data/lib/treat/install.rb +0 -59
- data/lib/treat/languages/tags.rb +0 -377
- data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
- data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
- data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
- data/lib/treat/lexicalizers/tag/brill.rb +0 -91
- data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
- data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
- data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
- data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
- data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
- data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
- data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
- data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
- data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
- data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
- data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
- data/lib/treat/registrable.rb +0 -28
- data/lib/treat/sugar.rb +0 -50
- data/lib/treat/viewable.rb +0 -29
- data/lib/treat/visitable.rb +0 -28
- data/test/profile.rb +0 -2
- data/test/tc_entity.rb +0 -117
- data/test/tc_extractors.rb +0 -73
- data/test/tc_formatters.rb +0 -41
- data/test/tc_inflectors.rb +0 -34
- data/test/tc_lexicalizers.rb +0 -32
- data/test/tc_processors.rb +0 -50
- data/test/tc_resources.rb +0 -22
- data/test/tc_treat.rb +0 -60
- data/test/tc_tree.rb +0 -60
- data/test/tests.rb +0 -20
- data/test/texts.rb +0 -19
- data/test/texts/english/half_cocked_basel.txt +0 -16
- data/test/texts/english/hose_and_dry.doc +0 -0
- data/test/texts/english/hungarys_troubles.abw +0 -70
- data/test/texts/english/long.html +0 -24
- data/test/texts/english/long.txt +0 -22
- data/test/texts/english/medium.txt +0 -5
- data/test/texts/english/republican_nomination.pdf +0 -0
- data/test/texts/english/saving_the_euro.odt +0 -0
- data/test/texts/english/short.txt +0 -3
- data/test/texts/english/zero_sum.html +0 -111
data/lib/treat/doable.rb
DELETED
@@ -1,45 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
module Doable
|
3
|
-
def do(*tasks)
|
4
|
-
tasks.each do |task|
|
5
|
-
if task.is_a?(Hash)
|
6
|
-
task.each do |k,v|
|
7
|
-
t, w = k, v
|
8
|
-
w, o = *w if w.is_a?(Array)
|
9
|
-
o ||= {}
|
10
|
-
do_task(t, w, o)
|
11
|
-
end
|
12
|
-
else
|
13
|
-
t = task.is_a?(Array) ? task[0] : task
|
14
|
-
w = task.is_a?(Array) ? task[1] : nil
|
15
|
-
w, o = *w if w.is_a?(Array)
|
16
|
-
o ||= {}
|
17
|
-
do_task(t, w, o)
|
18
|
-
end
|
19
|
-
end
|
20
|
-
end
|
21
|
-
DEBUG = true
|
22
|
-
def do_task(task, worker, options)
|
23
|
-
group = Categories.lookup(task)
|
24
|
-
unless group
|
25
|
-
raise Treat::Exception, "Task #{task} does not exist."
|
26
|
-
end
|
27
|
-
entity_types = group.targets
|
28
|
-
f = nil
|
29
|
-
entity_types.each do |t|
|
30
|
-
f = true if Treat::Entities.match_types[t][type]
|
31
|
-
end
|
32
|
-
if f || entity_types.include?(:entity)
|
33
|
-
send(task, worker, options)
|
34
|
-
else
|
35
|
-
each_entity(*entity_types) do |entity|
|
36
|
-
entity.do_task(task, worker, options)
|
37
|
-
end
|
38
|
-
unless entity_types.include?(type)
|
39
|
-
features.delete(task)
|
40
|
-
end
|
41
|
-
nil
|
42
|
-
end
|
43
|
-
end
|
44
|
-
end
|
45
|
-
end
|
@@ -1,14 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
module Entities
|
3
|
-
# Represents a collection of texts.
|
4
|
-
class Collection < Entity
|
5
|
-
# Initialize the collection with a folder
|
6
|
-
# containing the texts of the collection.
|
7
|
-
def initialize(folder = nil)
|
8
|
-
super('', id)
|
9
|
-
@type = :collection
|
10
|
-
set :folder, folder
|
11
|
-
end
|
12
|
-
end
|
13
|
-
end
|
14
|
-
end
|
@@ -1,17 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
module Entities
|
3
|
-
# Represents any syntactic phrase of a sentence.
|
4
|
-
class Phrase < Entity
|
5
|
-
def initialize(value = '', id = nil)
|
6
|
-
super(value, id)
|
7
|
-
@type = :phrase
|
8
|
-
end
|
9
|
-
end
|
10
|
-
class Sentence < Phrase
|
11
|
-
def initialize(value = '', id = nil)
|
12
|
-
super(value, id)
|
13
|
-
@type = :sentence
|
14
|
-
end
|
15
|
-
end
|
16
|
-
end
|
17
|
-
end
|
@@ -1,61 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
module Entities
|
3
|
-
# Represents a terminal element in the text structure.
|
4
|
-
class Token < Entity
|
5
|
-
# All tokens are leafs.
|
6
|
-
def is_leaf?; true; end
|
7
|
-
def initialize(value = '', id = nil)
|
8
|
-
super(value, id)
|
9
|
-
@type = :token
|
10
|
-
end
|
11
|
-
end
|
12
|
-
# Represents a word.
|
13
|
-
class Word < Token
|
14
|
-
def initialize(value = '', id = nil)
|
15
|
-
super(value, id)
|
16
|
-
@type = :word
|
17
|
-
end
|
18
|
-
end
|
19
|
-
# Represents a clitic ('s).
|
20
|
-
class Clitic < Token
|
21
|
-
def initialize(value = '', id = nil)
|
22
|
-
super(value, id)
|
23
|
-
@type = :clitic
|
24
|
-
end
|
25
|
-
end
|
26
|
-
# Represents a number.
|
27
|
-
class Number < Token
|
28
|
-
# Convert the number to an integer.
|
29
|
-
def to_i; to_s.to_i; end
|
30
|
-
# Convert the number to a float.
|
31
|
-
def to_f; to_s.to_f; end
|
32
|
-
def initialize(value = '', id = nil)
|
33
|
-
super(value, id)
|
34
|
-
@type = :number
|
35
|
-
end
|
36
|
-
end
|
37
|
-
# Represents a punctuation sign.
|
38
|
-
class Punctuation < Token
|
39
|
-
def initialize(value = '', id = nil)
|
40
|
-
super(value, id)
|
41
|
-
@type = :punctuation
|
42
|
-
end
|
43
|
-
end
|
44
|
-
# Represents a character that is neither
|
45
|
-
# alphabetical, numerical or a punctuation
|
46
|
-
# character (e.g. @#$%&*).
|
47
|
-
class Symbol < Token
|
48
|
-
def initialize(value = '', id = nil)
|
49
|
-
super(value, id)
|
50
|
-
@type = :symbol
|
51
|
-
end
|
52
|
-
end
|
53
|
-
# Represents an entity of unknown type.
|
54
|
-
class Unknown < Token
|
55
|
-
def initialize(value = '', id = nil)
|
56
|
-
super(value, id)
|
57
|
-
@type = :unknown
|
58
|
-
end
|
59
|
-
end
|
60
|
-
end
|
61
|
-
end
|
data/lib/treat/entities/zones.rb
DELETED
@@ -1,41 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
module Entities
|
3
|
-
# Represents a zone of text
|
4
|
-
# (Title, Paragraph, List, Quote).
|
5
|
-
class Zone < Entity
|
6
|
-
def initialize(value = '', id = nil)
|
7
|
-
super(value, id)
|
8
|
-
@type = :zone
|
9
|
-
end
|
10
|
-
end
|
11
|
-
# Represents a title, subtitle, logical header.
|
12
|
-
class Title < Zone
|
13
|
-
def initialize(value = '', id = nil)
|
14
|
-
super(value, id)
|
15
|
-
@type = :title
|
16
|
-
end
|
17
|
-
end
|
18
|
-
# Represents a paragraph.
|
19
|
-
class Paragraph < Zone
|
20
|
-
def initialize(value = '', id = nil)
|
21
|
-
super(value, id)
|
22
|
-
@type = :paragraph
|
23
|
-
end
|
24
|
-
end
|
25
|
-
# Represents a list.
|
26
|
-
class List < Zone
|
27
|
-
def initialize(value = '', id = nil)
|
28
|
-
super(value, id)
|
29
|
-
@type = :list
|
30
|
-
end
|
31
|
-
end
|
32
|
-
# Represents a section, usually with a title
|
33
|
-
# and at least one paragraph.
|
34
|
-
class Section < Zone
|
35
|
-
def initialize(value = '', id = nil)
|
36
|
-
super(value, id)
|
37
|
-
@type = :section
|
38
|
-
end
|
39
|
-
end
|
40
|
-
end
|
41
|
-
end
|
@@ -1,69 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
module Extractors
|
3
|
-
module Coreferences
|
4
|
-
class Stanford
|
5
|
-
require 'stanford-core-nlp'
|
6
|
-
@@pipeline = nil
|
7
|
-
def self.coreferences(entity, options = {})
|
8
|
-
val = entity.to_s
|
9
|
-
if entity.has_children?
|
10
|
-
warn "The Stanford Coreference Resolver currently requires " +
|
11
|
-
"an unsegmented, untokenized block of text to work with. " +
|
12
|
-
"Removing and replacing all children of '#{entity.short_value}'."
|
13
|
-
entity.remove_all!
|
14
|
-
end
|
15
|
-
@@pipeline ||= ::StanfordCoreNLP.load(
|
16
|
-
:tokenize, :ssplit, :pos,
|
17
|
-
:lemma, :parse, :ner, :dcoref
|
18
|
-
)
|
19
|
-
text = ::StanfordCoreNLP::Text.new(entity.to_s)
|
20
|
-
@@pipeline.annotate(text)
|
21
|
-
clusters = {}
|
22
|
-
text.get(:sentences).each do |sentence|
|
23
|
-
s = Treat::Entities::Sentence.
|
24
|
-
from_string(sentence.get(:value).to_s, true)
|
25
|
-
sentence.get(:tokens).each do |token|
|
26
|
-
t = Treat::Entities::Token.
|
27
|
-
from_string(token.value.to_s)
|
28
|
-
tag = token.get(:named_entity_tag).
|
29
|
-
to_s.downcase
|
30
|
-
corefid = token.get(:coref_cluster_id).to_s
|
31
|
-
unless corefid == ''
|
32
|
-
clusters[corefid] ||= []
|
33
|
-
clusters[corefid] << t
|
34
|
-
t.set :coref_cluster_id, corefid
|
35
|
-
end
|
36
|
-
|
37
|
-
t.set :named_entity_tag,
|
38
|
-
tag.intern unless tag == 'o'
|
39
|
-
s << t
|
40
|
-
end
|
41
|
-
entity << s
|
42
|
-
end
|
43
|
-
entity.each_token do |token|
|
44
|
-
if token.has?(:coref_cluster_id)
|
45
|
-
id = token.coref_cluster_id
|
46
|
-
links = clusters[id].dup
|
47
|
-
links.delete(token)
|
48
|
-
token.unset(:coref_cluster_id)
|
49
|
-
next if links.empty?
|
50
|
-
token.set :coreferents, links
|
51
|
-
links.each do |target|
|
52
|
-
token.link(target, :refers_to)
|
53
|
-
end
|
54
|
-
end
|
55
|
-
end
|
56
|
-
i = 0
|
57
|
-
coreferences = {}
|
58
|
-
clusters.each do |k,v|
|
59
|
-
unless !v || v.size == 1
|
60
|
-
coreferences[i] = v
|
61
|
-
i += 1
|
62
|
-
end
|
63
|
-
end
|
64
|
-
coreferences
|
65
|
-
end
|
66
|
-
end
|
67
|
-
end
|
68
|
-
end
|
69
|
-
end
|
@@ -1,32 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
module Extractors
|
3
|
-
module Date
|
4
|
-
# A wrapper for the 'chronic' gem, which parses
|
5
|
-
# date information.
|
6
|
-
#
|
7
|
-
# Project website: http://chronic.rubyforge.org/
|
8
|
-
class Chronic
|
9
|
-
silence_warnings { require 'chronic' }
|
10
|
-
require 'date'
|
11
|
-
# Return the date information contained within the entity
|
12
|
-
# by parsing it with the 'chronic' gem.
|
13
|
-
#
|
14
|
-
# Options: none.
|
15
|
-
def self.date(entity, options = {})
|
16
|
-
date = nil
|
17
|
-
return if entity.has?(:time)
|
18
|
-
s = entity.to_s
|
19
|
-
s.gsub!('\/', '/')
|
20
|
-
s.strip!
|
21
|
-
silence_warnings do
|
22
|
-
date = ::Chronic.parse(s, {:guess => true})
|
23
|
-
end
|
24
|
-
entity.ancestors_with_type(:phrase).each do |a|
|
25
|
-
a.unset(:date) if a.has?(:date)
|
26
|
-
end
|
27
|
-
return date.to_date if date
|
28
|
-
end
|
29
|
-
end
|
30
|
-
end
|
31
|
-
end
|
32
|
-
end
|
@@ -1,25 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
module Extractors
|
3
|
-
module Date
|
4
|
-
# A wrapper for Ruby's native date parsing.
|
5
|
-
class Ruby
|
6
|
-
require 'date'
|
7
|
-
# Return a DateTime object representing the date/date
|
8
|
-
# contained within the entity, using Ruby's native
|
9
|
-
# date/date parser.
|
10
|
-
#
|
11
|
-
# Options: none.
|
12
|
-
def self.date(entity, options = {})
|
13
|
-
begin
|
14
|
-
s = entity.to_s.strip
|
15
|
-
s.gsub!('\/', '/')
|
16
|
-
date = ::DateTime.parse(s)
|
17
|
-
date.to_date
|
18
|
-
rescue
|
19
|
-
nil
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|
@@ -1,48 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
module Extractors
|
3
|
-
module Keywords
|
4
|
-
class TopicsTfIdf
|
5
|
-
DefaultOptions = {num_keywords: 5, tf_idf_threshold: 0.5, topic_words: nil}
|
6
|
-
def self.keywords(entity, options = {})
|
7
|
-
options = DefaultOptions.merge(options)
|
8
|
-
unless options[:topic_words]
|
9
|
-
options[:topic_words] = entity.parent_collection.topic_words
|
10
|
-
end
|
11
|
-
if Treat::Entities.rank(entity.type) <
|
12
|
-
Treat::Entities.rank(:sentence)
|
13
|
-
raise Treat::Exception, 'Cannot get the key ' +
|
14
|
-
'sentences of an entity smaller than a sentence.'
|
15
|
-
else
|
16
|
-
find_keywords(entity, options)
|
17
|
-
end
|
18
|
-
end
|
19
|
-
def self.find_keywords(entity, options)
|
20
|
-
keywords = []
|
21
|
-
entity.each_word do |word|
|
22
|
-
found = false
|
23
|
-
tf_idf = word.tf_idf
|
24
|
-
options[:topic_words].each do |i, topic_words|
|
25
|
-
next if keywords.include?(word.value)
|
26
|
-
if topic_words.include?(word.value)
|
27
|
-
found = true
|
28
|
-
if tf_idf > options[:tf_idf_threshold]
|
29
|
-
keywords << word.value
|
30
|
-
word.set :is_keyword?, found
|
31
|
-
end
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end
|
35
|
-
i = 0
|
36
|
-
# Take a slice of keywords with i elements.
|
37
|
-
selected_keywords = []
|
38
|
-
keywords.each do |keyword|
|
39
|
-
break if i > options[:num_keywords]
|
40
|
-
selected_keywords << keyword
|
41
|
-
i += 1
|
42
|
-
end
|
43
|
-
selected_keywords
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
47
|
-
end
|
48
|
-
end
|
@@ -1,27 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
module Extractors
|
3
|
-
module Language
|
4
|
-
# A generic language detector, which is called before
|
5
|
-
# any language detector and ensures that configuration
|
6
|
-
# options concerning language are enforced (e.g. returns
|
7
|
-
# the default language when Treat.detect_language is false).
|
8
|
-
class LanguageExtractor
|
9
|
-
def self.language(entity, options = {})
|
10
|
-
if entity.to_s =~ /^[[:digit:]]+$/
|
11
|
-
return Treat.default_language
|
12
|
-
end
|
13
|
-
if Treat.detect_language == false
|
14
|
-
return Treat.default_language
|
15
|
-
else
|
16
|
-
dlvl = Treat.language_detection_level
|
17
|
-
if (Entities.rank(entity.type) < Entities.rank(dlvl)) &&
|
18
|
-
entity.has_parent?
|
19
|
-
anc = entity.ancestor_with_type(dlvl)
|
20
|
-
return anc.language if anc
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|
27
|
-
end
|
@@ -1,53 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
module Extractors
|
3
|
-
module NamedEntityTag
|
4
|
-
class Stanford
|
5
|
-
require 'stanford-core-nlp'
|
6
|
-
StanfordCoreNLP.load_class('ArrayList', 'java.util')
|
7
|
-
StanfordCoreNLP.load_class('Word', 'edu.stanford.nlp.ling')
|
8
|
-
@@pipeline = nil
|
9
|
-
def self.named_entity_tag(entity, options = {})
|
10
|
-
pp = nil
|
11
|
-
if entity.is_a?(Treat::Entities::Token) &&
|
12
|
-
entity.has_parent?
|
13
|
-
pp = entity.parent_phrase
|
14
|
-
s = get_list(pp.tokens)
|
15
|
-
else
|
16
|
-
s = entity.to_s
|
17
|
-
end
|
18
|
-
|
19
|
-
@@pipeline ||= ::StanfordCoreNLP.load(
|
20
|
-
:tokenize, :ssplit, :pos, :lemma, :parse, :ner
|
21
|
-
)
|
22
|
-
|
23
|
-
text = ::StanfordCoreNLP::Text.new(s)
|
24
|
-
@@pipeline.annotate(text)
|
25
|
-
|
26
|
-
add_to = pp ? pp : entity
|
27
|
-
|
28
|
-
if entity.is_a?(Treat::Entities::Phrase)
|
29
|
-
text.get(:tokens).each do |token|
|
30
|
-
t = Treat::Entities::Token.from_string(token.value.to_s)
|
31
|
-
tag = token.get(:named_entity_tag).to_s.downcase
|
32
|
-
t.set :named_entity_tag, tag.intern unless tag == 'o'
|
33
|
-
add_to << t
|
34
|
-
end
|
35
|
-
elsif entity.is_a?(Treat::Entities::Token)
|
36
|
-
tag = text.get(:tokens).iterator.next.
|
37
|
-
get(:named_entity_tag).to_s.downcase
|
38
|
-
entity.set :named_entity_tag, tag.intern unless tag == 'o'
|
39
|
-
end
|
40
|
-
|
41
|
-
end
|
42
|
-
|
43
|
-
def self.get_list(words)
|
44
|
-
list = StanfordCoreNLP::ArrayList.new
|
45
|
-
words.each do |w|
|
46
|
-
list.add(StanfordCoreNLP::Word.new(w.to_s))
|
47
|
-
end
|
48
|
-
list
|
49
|
-
end
|
50
|
-
end
|
51
|
-
end
|
52
|
-
end
|
53
|
-
end
|