treat 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +7 -8
- data/TODO +16 -13
- data/examples/keywords.rb +89 -1
- data/lib/treat/buildable.rb +1 -8
- data/lib/treat/categories.rb +3 -4
- data/lib/treat/category.rb +1 -1
- data/lib/treat/delegatable.rb +1 -1
- data/lib/treat/detectors/encoding/native.rb +5 -0
- data/lib/treat/detectors/encoding/r_chardet19.rb +2 -3
- data/lib/treat/detectors/language/language_detector.rb +4 -0
- data/lib/treat/detectors/language/what_language.rb +4 -4
- data/lib/treat/detectors.rb +1 -1
- data/lib/treat/entities/entity.rb +5 -3
- data/lib/treat/entities/tokens.rb +14 -5
- data/lib/treat/entities/zones.rb +4 -0
- data/lib/treat/entities.rb +7 -5
- data/lib/treat/extractors/keywords/topics_frequency.rb +40 -0
- data/lib/treat/extractors/statistics/{frequency.rb → frequency_in.rb} +5 -4
- data/lib/treat/extractors/statistics/frequency_of.rb +3 -5
- data/lib/treat/extractors/statistics/{position_in.rb → position_in_parent.rb} +4 -3
- data/lib/treat/extractors/statistics/tf_idf.rb +36 -0
- data/lib/treat/extractors/statistics/transition_matrix.rb +25 -25
- data/lib/treat/extractors/statistics/transition_probability.rb +8 -4
- data/lib/treat/extractors/time/chronic.rb +8 -0
- data/lib/treat/extractors/time/native.rb +6 -0
- data/lib/treat/extractors/time/nickel.rb +31 -23
- data/lib/treat/extractors/topic_words/lda.rb +21 -16
- data/lib/treat/extractors/topics/reuters.rb +6 -4
- data/lib/treat/extractors.rb +7 -7
- data/lib/treat/formatters/readers/abw.rb +32 -0
- data/lib/treat/formatters/readers/autoselect.rb +13 -11
- data/lib/treat/formatters/readers/doc.rb +13 -0
- data/lib/treat/formatters/readers/gocr.rb +2 -0
- data/lib/treat/formatters/readers/html.rb +21 -1
- data/lib/treat/formatters/readers/ocropus.rb +3 -3
- data/lib/treat/formatters/readers/odt.rb +41 -0
- data/lib/treat/formatters/readers/pdf.rb +5 -2
- data/lib/treat/formatters/readers/txt.rb +2 -0
- data/lib/treat/formatters/serializers/xml.rb +3 -2
- data/lib/treat/formatters/serializers/yaml.rb +2 -0
- data/lib/treat/formatters/unserializers/autoselect.rb +7 -1
- data/lib/treat/formatters/unserializers/xml.rb +6 -1
- data/lib/treat/formatters/unserializers/yaml.rb +5 -1
- data/lib/treat/formatters/visualizers/dot.rb +35 -37
- data/lib/treat/formatters/visualizers/html.rb +1 -0
- data/lib/treat/formatters/visualizers/inspect.rb +4 -0
- data/lib/treat/formatters/visualizers/short_value.rb +18 -3
- data/lib/treat/formatters/visualizers/standoff.rb +11 -6
- data/lib/treat/formatters/visualizers/tree.rb +5 -1
- data/lib/treat/formatters/visualizers/txt.rb +6 -1
- data/lib/treat/formatters.rb +1 -1
- data/lib/treat/group.rb +4 -3
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +14 -17
- data/lib/treat/inflectors/conjugations/linguistics.rb +16 -3
- data/lib/treat/inflectors/declensions/linguistics.rb +17 -6
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +9 -10
- data/lib/treat/inflectors/stem/porter.rb +6 -2
- data/lib/treat/inflectors/stem/porter_c.rb +4 -1
- data/lib/treat/inflectors/stem/uea.rb +4 -4
- data/lib/treat/languages/english/tags.rb +16 -0
- data/lib/treat/languages/english.rb +4 -1
- data/lib/treat/lexicalizers/category/from_tag.rb +4 -4
- data/lib/treat/lexicalizers/linkages/naive.rb +3 -3
- data/lib/treat/lexicalizers/tag/brill.rb +3 -11
- data/lib/treat/lexicalizers/tag/lingua.rb +4 -6
- data/lib/treat/lexicalizers.rb +0 -2
- data/lib/treat/processors/chunkers/txt.rb +4 -4
- data/lib/treat/processors/parsers/enju.rb +3 -17
- data/lib/treat/processors/parsers/stanford.rb +4 -0
- data/lib/treat/processors/segmenters/punkt.rb +1 -0
- data/lib/treat/processors/segmenters/stanford.rb +4 -0
- data/lib/treat/processors/segmenters/tactful.rb +4 -1
- data/lib/treat/processors/tokenizers/punkt.rb +1 -2
- data/lib/treat/processors/tokenizers/stanford.rb +4 -0
- data/lib/treat/processors/tokenizers/tactful.rb +1 -1
- data/lib/treat/processors.rb +4 -4
- data/lib/treat/proxies.rb +18 -11
- data/lib/treat/registrable.rb +12 -5
- data/lib/treat/sugar.rb +8 -3
- data/lib/treat/tree.rb +10 -3
- data/lib/treat.rb +55 -55
- data/test/tc_entity.rb +7 -7
- data/test/tc_extractors.rb +6 -4
- data/test/tc_formatters.rb +0 -4
- data/test/tests.rb +2 -0
- data/test/texts.rb +4 -4
- metadata +48 -56
- data/examples/texts/bugged_out.txt +0 -26
- data/examples/texts/half_cocked_basel.txt +0 -16
- data/examples/texts/hedge_funds.txt +0 -24
- data/examples/texts/hose_and_dry.txt +0 -19
- data/examples/texts/hungarys_troubles.txt +0 -46
- data/examples/texts/indias_slowdown.txt +0 -15
- data/examples/texts/merkozy_rides_again.txt +0 -24
- data/examples/texts/prada_is_not_walmart.txt +0 -9
- data/examples/texts/republican_nomination.txt +0 -26
- data/examples/texts/to_infinity_and_beyond.txt +0 -15
- data/lib/treat/entities/text.rb +0 -7
- data/lib/treat/extractors/key_sentences/topics_frequency.rb +0 -49
- data/lib/treat/formatters/cleaners/html.rb +0 -17
data/LICENSE
CHANGED
@@ -18,11 +18,10 @@ Author: Louis-Antoine Mullie (louis.mullie@gmail.com). Copyright 2012.
  Non-trivial amount of code has been incorporated and modified from
  other libraries, specifically for the following files:

-
-
-
-
-
-
-
- - tree.rb - Partyl based on work by
+ - processors/tokenizers/macintyre.rb - Utiyama Masao (Ruby License)
+ - processors/tokenizers/tactful.rb - Matthew Bunday (GPL license)
+ - processors/tokenizers/perl.rb - Todd A. Fisher (MIT License)
+ - processors/tokenizers/punkt.rb - Steven Bird Edward Loper and Joel Nothman (Apache 2.0 license)
+ - extractors/topics/reuters.rb - Mark Watson (GPL license)
+ - inflectors/stemmers/porter.rb - Ray Pereda (No license information)
+ - tree.rb - Partly based on work by Anupam Sengupta (Creative Commons Attribution-ShareAlike Unported v. 3.0)
data/TODO
CHANGED
@@ -1,25 +1,26 @@
  ## Urgent

- - Linkers
+ - Linkers + documentation
  - Check taggers for context
  - Stanford dependencies parse
- - Enju: test
  - Ocropus => use better function
  - Optimize magic methods... is_token? type methods, phrase categories.
- - Move statistics
+ - Move statistics
  - Synset class move
  - general procedure for options, check that user doesn't want to change options...
- -
- - Check for # Fix everywhere
- - Check paths; parse bin paths
- - Ferret, Natural Inputs
- - Use consistently delegate
- - Text becomes section
- - Remove top level
+ - Ferret, Spider
  - Loading multiple JARs
+ - Linguistics loader, stanford loader
  - Tokenized sentences are not parsed
- -
- -
+ - Dot colors
+ - Fix encoders
+ - Fix Punkt segmenter training text.
+ - Mark Watson's text extractor
+ - Statistics position in
+ - Fix documentation antiword, Graphviz, # encoding: utf-8
+ - Shortcut methods.. pre/postprocessors
+ - Only Phrase..
+ - Frequency in

  ## Eventually

@@ -52,4 +53,6 @@
  - String type detector for other languages
  - Automatic benchmark
  - Raspell spell checker
- - Multithreading
+ - Multithreading
+ - Mark Watson's Java NLP utility to identify proper nouns (human names and places) in text
+ - FastTag a Java fast part of speech tagger.
data/examples/keywords.rb
CHANGED
@@ -57,4 +57,92 @@ c.each_document do |d|
  end
  end

- end
+ end
+
+
+
+ Treat.edulcorate
+ Treat.bin = '/ruby/nat/bin'
+
+ c = Collection 'economist'
+ c.each_document { |doc| doc.chunk.segment.tokenize }
+
+ topic_words = c.topic_words(
+   :lda,
+   :topics => 5,
+   :words_per_topic => 5,
+   :iterations => 20
+ )
+
+ keywords = c.keywords(
+   :topics_frequency,
+   :topic_words => topic_words,
+   :tf_idf_threshold => 180
+ )
+
+ puts keywords.inspect
+
+ abort
+
+ c = Phrase 'a test clause'
+ c.parse
+ puts c.visualize(:tree)
+ puts c.visualize(:inspect)
+ puts c.visualize(:short_value)
+ puts c.visualize(:standoff)
+ puts c.visualize(:tree)
+
+ c.serialize(:yaml).save('test.yml')
+ c.serialize(:xml).save('test.xml')
+
+ d = Phrase 'test.yml'
+ d.print_tree
+ d = Phrase 'test.xml'
+ d.print_tree
+
+ puts d.words[0].position_in_parent
+ abort
+
+ w = Word 'running'
+ puts w.stem(:porter_c)
+ puts w.stem(:porter)
+ puts w.stem(:uea)
+
+ w = Word 'run'
+
+ puts w.infinitive(:linguistics)
+ puts w.present_participle(:linguistics)
+ puts w.plural(:linguistics)
+
+ w = Word 'table'
+
+ puts w.synonyms.inspect
+ puts w.antonyms.inspect
+ puts w.hyponyms.inspect
+ puts w.hypernyms.inspect
+
+ n = Number 2
+ puts n.ordinal_words(:linguistics)
+ puts n.cardinal_words(:linguistics)
+
+ s = Sentence 'A sentence to parse.'
+ s.dup.parse(:enju).print_tree
+ s.dup.parse(:stanford).print_tree
+
+ s = Sentence 'A sentence to tokenize'
+ s.dup.tokenize(:macintyre).print_tree
+ s.dup.tokenize(:multilingual).print_tree
+ s.dup.tokenize(:perl).print_tree
+ s.dup.tokenize(:punkt).print_tree
+ s.dup.tokenize(:stanford).print_tree
+ s.dup.tokenize(:tactful).print_tree
+
+
+ =begin
+ c = Collection 'economist'
+ # c.each_document { |d| d.chunk.segment.tokenize }
+ c.documents[0].chunk.segment
+ c.sentences[0].parse(:enju)
+ c.each_word { |word| word.stem }
+ c.visualize(:dot, features: [:tag]).save('test.dot')
+ =end
data/lib/treat/buildable.rb
CHANGED
@@ -24,9 +24,8 @@ module Treat
  "Cannot create a document or collection from " +
  "a string (need a readable file/folder)."
  end
- string = string.to_s
  dot = string.count('.') + string.count('!') + string.count('?')
- return Treat::Entities::
+ return Treat::Entities::Section.new(string) if dot > 1 ||
  (string.count("\n") > 0 && dot == 1)
  return Treat::Entities::Sentence.new(string) if dot == 1 && string.size > 5
  if string.count(' ') == 0
@@ -99,12 +98,6 @@ module Treat
  d.read
  end
  def from_serialized_file(file)
- unless [Treat::Entities::Document,
- Treat::Entities::Collection].include?(self)
- raise Treat::Exception,
- "Cannot create something else than a " +
- "document from raw file '#{file}'."
- end
  d = Treat::Entities::Document.new(file)
  d.unserialize
  d.children[0].set_as_root!
data/lib/treat/categories.rb
CHANGED
@@ -1,17 +1,16 @@
  module Treat
  # This module keeps track of all categories that
- # exist and the methods they implement
- # responsible for including the categories.
+ # exist and the methods they implement.
  module Categories
- # A list of categories.
  class << self; attr_accessor :list; end
+ # Array - list of all categories.
  self.list = []
  # Boolean - does any of the categories have
  # a method that corresponds to sym?
  def self.have_method?(sym); methods.include?(sym); end
  # Cache the list of methods once it has been computed.
  @@methods = []
- #
+ # Array - provide a list of all methods implemented
  # by all Treat categories.
  def self.methods
  return @@methods unless @@methods.empty?
data/lib/treat/category.rb
CHANGED
@@ -12,7 +12,7 @@ module Treat
  groups.each do |group|
  group = const_get(group)
  group.targets.each do |entity_type|
- entity = Entities.const_get(cc(entity_type))
+ entity = Treat::Entities.const_get(cc(entity_type))
  entity.class_eval { add_delegators group }
  end
  end
data/lib/treat/delegatable.rb
CHANGED
@@ -46,7 +46,7 @@ module Treat
  delegate_klass = group.const_get(:"#{cc(delegate.to_s)}")
  result = entity.accept(group, delegate_klass, m, options)
  if decorator
- result = group.send(decorator,
+ result = group.send(decorator, entity, result)
  end
  if group.type == :annotator
  f = decorator.nil? ? m : decorator
data/lib/treat/detectors/encoding/native.rb
CHANGED
@@ -1,7 +1,12 @@
  module Treat
  module Detectors
  module Encoding
+ # A wrapper class for Ruby's native encoding detector.
  class Native
+ # Return the encoding of the entity according
+ # to the Ruby interpreter.
+ #
+ # Options: none.
  def self.encoding(entity, options={})
  entity.value.encoding.name.
  gsub('-', '_').downcase.intern
data/lib/treat/detectors/encoding/r_chardet19.rb
CHANGED
@@ -6,9 +6,8 @@ module Treat
  # A wrapper for the 'rchardet19' gem, which
  # detects the encoding of a file.
  class RChardet19
- # Returns
- #
- # text value.
+ # Returns the encoding of the entity according
+ # to the 'rchardet19' gem.
  #
  # Options: none.
  def self.encoding(entity, options={})
data/lib/treat/detectors/language/language_detector.rb
CHANGED
@@ -1,6 +1,10 @@
  module Treat
  module Detectors
  module Language
+ # A generic language detector, which is called before
+ # any language detector and ensures that configuration
+ # options concerning language are enforced (e.g. returns
+ # the default language when Treat.detect_language is false).
  class LanguageDetector
  def self.language(entity, options = {})
  if Treat.detect_language == false
data/lib/treat/detectors/language/what_language.rb
CHANGED
@@ -7,7 +7,7 @@ module Treat
  # performs probabilistic language detection.
  class WhatLanguage < LanguageDetector
  # Keep only once instance of the gem class.
- @@
+ @@detector = nil
  # Detect the language of an entity using the
  # 'whatlanguage' gem. Return an identifier
  # corresponding to the ISO-639-2 code for the
@@ -15,10 +15,10 @@ module Treat
  def self.language(entity, options = {})
  predetection = super(entity, options)
  return predetection if predetection
- @@
-
+ @@detector ||= ::WhatLanguage.new(:possibilities)
+ possibilities = @@detector.process_text(entity.to_s)
  lang = {}
-
+ possibilities.each do |k,v|
  lang[Treat::Languages.find(k)] = v
  end
  Treat::Feature.new(lang).best
data/lib/treat/detectors.rb
CHANGED
@@ -2,7 +2,7 @@ module Treat
  # Detectors detect a specific meta-information about
  # an entity, such as encoding, format and language.
  #
- # Detectors are language-independent, and thus
+ # Detectors are language-independent, and thus there
  # are default algorithms specified for each of them.
  module Detectors
  # Group for algorithms that detect encoding.
data/lib/treat/entities/entity.rb
CHANGED
@@ -43,7 +43,7 @@ module Treat
  # feature does not exist
  def method_missing(sym, *args, &block)
  return self.build(*args) if sym == nil
- if !@features
+ if !@features.has_key?(sym)
  r = parse_magic_method(sym, *args, &block)
  if r == :no_magic
  begin
@@ -168,7 +168,10 @@ module Treat
  def <<(entities, clear_parent = true)
  entities = [entities] unless entities.is_a? Array
  entities.each do |entity|
-
+ if entity.is_a?(Treat::Entities::Token) ||
+ entity.is_a?(Treat::Entities::Constituent)
+ register_token(entity) unless entity.value == ''
+ end
  end
  super(entities)
  @parent.value = '' if has_parent?
@@ -211,7 +214,6 @@ module Treat
  def short_value(ml = 6); visualize(:short_value, :max_length => ml); end
  # Convenience functions. Convenience decorators.
  def frequency_of(word); statistics(:frequency_of, value: word); end
-
  private
  # Return the first element in the array, warning if not
  # the only one in the array. Used for magic methods: e.g.,
data/lib/treat/entities/tokens.rb
CHANGED
@@ -4,15 +4,24 @@ module Treat
  class Token < Entity
  # All tokens are leafs.
  def is_leaf?; true; end
-
+ # Convenience function for statistics.
+ def frequency; statistics(:frequency_in); end
+ def frequency_in(type); statistics(:frequency_in, type: type); end
+ def position_in(type); statistics(:position_in_parent); end
+ def tf_idf; statistics(:tf_idf); end
  end
  # Represents a word.
  class Word < Token
-
- def
-
- def
+ # Convenience function for conjugations.
+ def infinitive(conjugator = nil); conjugations(conjugator, :mode => :infinitive); end
+ # Convenience function for conjugations.
+ def present_participle(conjugator = nil); conjugations(conjugator, :tense => :present, :mode => :participle); end
+ # Convenience function for declensions.
+ def plural(declensor = nil); declensions(declensor, :count => :plural); end
+ # Convenience function for declensions.
+ def singular(declensor = nil); declensions(declensor, :count => :singular); end
  end
+ # Represents a clitic ('s).
  class Clitic < Token
  end
  # Represents a number.
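Note: a rough sketch of how the convenience wrappers added to Token above might be called, assuming a document d that has been chunked, segmented and tokenized as in data/examples/keywords.rb (the calls below are illustrative and not part of the release):

  word = d.words[0]
  word.frequency                # statistics(:frequency_in)
  word.frequency_in(:document)  # statistics(:frequency_in, type: :document)
  word.position_in(:sentence)   # statistics(:position_in_parent)
  word.tf_idf                   # statistics(:tf_idf), used by the new keywords extractor below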
data/lib/treat/entities/zones.rb
CHANGED
data/lib/treat/entities.rb
CHANGED
@@ -14,7 +14,6 @@ module Treat
  # Then require all possible entities.
  require 'treat/entities/collection'
  require 'treat/entities/document'
- require 'treat/entities/text'
  require 'treat/entities/zones'
  require 'treat/entities/sentence'
  require 'treat/entities/constituents'
@@ -25,9 +24,11 @@ module Treat
  const_get(entity).build(value, id)
  end
  end
+ # Cache a list of defined entity types to
+ # improve performance.
+ @@list = []
  # Provide a list of defined entity types,
  # as non-camel case identifiers.
- @@list = []
  def self.list
  return @@list unless @@list.empty?
  self.constants.each do |constant|
@@ -35,16 +36,17 @@ module Treat
  end
  @@list
  end
- # Return the
- #
+ # Return the hierarchy level of the entity
+ # class, the minimum being a Token and the
+ # maximum being a Collection.
  def self.rank(type)
  klass = Entities.const_get(cc(type))
  compare = lambda { |a,b| a == b || a < b }
  return 0 if compare.call(klass, Token)
  return 1 if compare.call(klass, Constituent)
  return 2 if compare.call(klass, Sentence)
+ return 3 if compare.call(klass, Zone)
  return 4 if compare.call(klass, Document)
- return 3 if compare.call(klass, Section)
  return 5 if compare.call(klass, Collection)
  end
  end
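Note: reading the revised rank method above, the hierarchy levels it would now return are as follows (the symbols come from the entity classes referenced in the code; the calls themselves are only illustrative):

  Treat::Entities.rank(:word)         # => 0  (Word < Token, per tokens.rb above)
  Treat::Entities.rank(:constituent)  # => 1
  Treat::Entities.rank(:sentence)     # => 2
  Treat::Entities.rank(:zone)         # => 3  (Zone replaces the removed Section check)
  Treat::Entities.rank(:document)     # => 4
  Treat::Entities.rank(:collection)   # => 5

The new keywords extractor below uses this ordering to refuse anything ranked lower than a sentence.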
data/lib/treat/extractors/keywords/topics_frequency.rb
ADDED
@@ -0,0 +1,40 @@
+ module Treat
+   module Extractors
+     module Keywords
+       class TopicsFrequency
+         DefaultOptions = {tf_idf_threshold: 180, topic_words: nil}
+         def self.keywords(entity, options = {})
+           options = DefaultOptions.merge(options)
+           unless options[:topic_words]
+             raise Treat::Exception, "You must supply topic words."
+           end
+           if Treat::Entities.rank(entity.type) <
+             Treat::Entities.rank(:sentence)
+             raise Treat::Exception, 'Cannot get the key ' +
+             'sentences of an entity smaller than a sentence.'
+           else
+             find_keywords(entity, options)
+           end
+         end
+         def self.find_keywords(entity, options)
+           keywords = []
+           entity.each_word do |word|
+             found = false
+             options[:topic_words].each do |i, topic_words|
+               next if keywords.include?(word.value)
+               if topic_words.include?(word.value)
+                 found = true
+                 tf_idf = word.tf_idf
+                 if tf_idf < options[:tf_idf_threshold]
+                   keywords << word.value
+                   word.set :is_keyword?, found
+                 end
+               end
+             end
+           end
+           keywords
+         end
+       end
+     end
+   end
+ end
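Note: the data/examples/keywords.rb script earlier in this diff exercises the new extractor; in short, with the parameters copied from that example:

  topic_words = c.topic_words(:lda, :topics => 5, :words_per_topic => 5, :iterations => 20)
  keywords    = c.keywords(:topics_frequency, :topic_words => topic_words, :tf_idf_threshold => 180)
  puts keywords.inspect

where c is a chunked, segmented and tokenized Collection; words that clear the tf_idf test are also flagged on the entity itself (word.set :is_keyword?, found).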
data/lib/treat/extractors/statistics/{frequency.rb → frequency_in.rb}
CHANGED
@@ -1,15 +1,16 @@
  module Treat
  module Extractors
  module Statistics
- class
-
+ class FrequencyIn
+ DefaultOptions = {type: nil}
  def self.statistics(entity, options={})
+ options = DefaultOptions.merge(options)
  if entity.is_leaf?
  w = entity.value.downcase
- if entity.token_registry[:value][w].nil?
+ if entity.token_registry(options[:type])[:value][w].nil?
  0
  else
- entity.token_registry[:value][w].size
+ entity.token_registry(options[:type])[:value][w].size
  end
  else
  raise Treat::Exception,
data/lib/treat/extractors/statistics/frequency_of.rb
CHANGED
@@ -5,11 +5,9 @@ module Treat
  # Find the frequency of a given string value.
  def self.statistics(entity, options = {})
  w = options[:value]
-
-
-
- entity.token_registry[:value][w].size
- end
+ raise Treat::Exception, "Must supply a non-nil value." unless w
+ entity.token_registry[:value][w].nil? ? 0 :
+ entity.token_registry[:value][w].size
  end
  end
  end
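Note: Entity#frequency_of earlier in this diff wraps this extractor (statistics(:frequency_of, value: word)), so a call would look roughly like the following (the word 'economy' is made up for illustration):

  doc.frequency_of('economy')  # => 0 when the value is absent from the token registry, otherwise its occurrence count

The new TfIdf extractor below calls this same decorator on each document of the collection.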
data/lib/treat/extractors/statistics/{position_in.rb → position_in_parent.rb}
CHANGED
@@ -1,11 +1,12 @@
  module Treat
  module Extractors
  module Statistics
- class
+ class PositionInParent
  # Find the position of the current entity
  # inside the parent entity with type entity_type.
-
-
+ # Not implemented.
+ def self.statistics(entity, options = {})
+ entity.parent.children.index(entity)
  end
  end
  end
data/lib/treat/extractors/statistics/tf_idf.rb
ADDED
@@ -0,0 +1,36 @@
+ module Treat
+   module Extractors
+     module Statistics
+       # "The term count in the given document is simply the
+       # number of times a given term appears in that document.
+       # This count is usually normalized to prevent a bias
+       # towards longer documents (which may have a higher
+       # term count regardless of the actual importance of
+       # that term in the document) to give a measure of the
+       # importance of the term t within the particular document d.
+       # Thus we have the term frequency tf(t,d), defined in the
+       # simplest case as the occurrence count of a term in a document.
+       #
+       # The inverse document frequency is a measure of the general
+       # importance of the term (obtained by dividing the total number
+       # of documents by the number of documents containing the term,
+       # and then taking the logarithm of that quotient)."
+       #
+       # (From Wikipedia)
+       class TfIdf
+         DefaultOptions = { type: nil }
+         def self.statistics(entity, options={})
+           tf = entity.frequency_in(:document)
+           tf = tf / entity.root.word_count
+           d = entity.root.document_count
+           i = 0
+           entity.root.each_document do |document|
+             i += 1 if document.frequency_of(entity.value)
+           end
+           idf = ::Math.log(d.to_f/(i.to_f + 1)).abs
+           tf.to_f/idf.to_f
+         end
+       end
+     end
+   end
+ end
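Note: a standalone sketch of the arithmetic the new TfIdf extractor performs, using made-up counts and floats for clarity (the steps mirror the method above; as written it divides tf by idf rather than multiplying):

  tf  = 4.0 / 250                       # frequency_in(:document) divided by the root's word_count
  d   = 10                              # number of documents in the collection
  i   = 3                               # documents counted as containing the word
  idf = Math.log(d.to_f / (i + 1)).abs  # inverse document frequency
  tf / idf                              # the value the extractor returns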
data/lib/treat/extractors/statistics/transition_matrix.rb
CHANGED
@@ -1,23 +1,23 @@
  module Treat
  module Extractors
  module Statistics
+ # Experimental algorithm to generate transition matrices.
  class TransitionMatrix
-
+ DefaultOptions = {
+ normalize: true,
+ features: [:tag],
+ condition: lambda { |e| true },
+ entity_types: [:word],
+ relationships: [:parent, :right, :children]
+ }
  # Find the transition matrix.
  def self.statistics(entity, options={})
-
- normalize = options[:normalize] || true
- features = options[:features] || [:tag]
- condition = options[:condition] || lambda { |e| true }
- entity_types = options[:entity_types] ? options[:entity_types] :
- [options[:entity_type]]
- relationships = options[:relationships] ||
- [:parent, :left, :right, :children]
+ options = DefaultOptions.merge(options)

  # Create lambdas to generate the arrays.
- empty_prototype = {}; features.each { |f| empty_prototype[f] = {} }
+ empty_prototype = {}; options[:features].each { |f| empty_prototype[f] = {} }
  empty = lambda { Marshal.load(Marshal.dump(empty_prototype)) }
- empty2_prototype = {}; relationships.each { |r| empty2_prototype[r] = empty.call }
+ empty2_prototype = {}; options[:relationships].each { |r| empty2_prototype[r] = empty.call }
  empty2 = lambda { Marshal.load(Marshal.dump(empty2_prototype)) }

  # Deep (recursive) merger.
@@ -27,24 +27,25 @@ module Treat

  # Master matrix.
  mm = nil
+ tm = empty.call

- entity.each_entity(*entity_types) do |target|
-
- next unless condition.call(target)
+ entity.each_entity(*options[:entity_types]) do |target|
+
+ next unless options[:condition].call(target)

  # Initialize the empty transition matrix.
-
+

  # Calculate the transition probabilities.
- features.each do |f1|
+ options[:features].each do |f1|

  v1 = target.send(f1)
  tm[f1][v1] = empty2.call

- relationships.each do |relationship|
+ options[:relationships].each do |relationship|
  tm[f1][v1][relationship] = empty.call
-
- features.each do |f2|
+
+ options[:features].each do |f2|
  relatives = target.send(relationship)
  relatives = [relatives] unless relatives.is_a? Array
  relatives.each do |relative|
@@ -55,9 +56,9 @@ module Treat
  tm[f1][v1][relationship][f2][v2] += 1.0
  end
  end
-
+
  tm[f1][v1][:edge] = empty.call
-
+
  target.edges.each do |id, edge_type|
  s = target.ancestor_with_type :sentence
  if s
@@ -68,14 +69,13 @@ module Treat
  tm[f1][v1][:edge][f2][v2] += 1.0
  end
  end
-
+
  end
  end
  end
-
- mm = mm ? mm.merge(tm, &merger) : tm
  end
-
+ mm = mm ? mm.merge(tm, &merger) : tm
+ if options[:normalize]
  normalize(mm)
  else
  mm
|