treat 0.1.2 → 0.1.3
- data/LICENSE +7 -8
- data/TODO +16 -13
- data/examples/keywords.rb +89 -1
- data/lib/treat/buildable.rb +1 -8
- data/lib/treat/categories.rb +3 -4
- data/lib/treat/category.rb +1 -1
- data/lib/treat/delegatable.rb +1 -1
- data/lib/treat/detectors/encoding/native.rb +5 -0
- data/lib/treat/detectors/encoding/r_chardet19.rb +2 -3
- data/lib/treat/detectors/language/language_detector.rb +4 -0
- data/lib/treat/detectors/language/what_language.rb +4 -4
- data/lib/treat/detectors.rb +1 -1
- data/lib/treat/entities/entity.rb +5 -3
- data/lib/treat/entities/tokens.rb +14 -5
- data/lib/treat/entities/zones.rb +4 -0
- data/lib/treat/entities.rb +7 -5
- data/lib/treat/extractors/keywords/topics_frequency.rb +40 -0
- data/lib/treat/extractors/statistics/{frequency.rb → frequency_in.rb} +5 -4
- data/lib/treat/extractors/statistics/frequency_of.rb +3 -5
- data/lib/treat/extractors/statistics/{position_in.rb → position_in_parent.rb} +4 -3
- data/lib/treat/extractors/statistics/tf_idf.rb +36 -0
- data/lib/treat/extractors/statistics/transition_matrix.rb +25 -25
- data/lib/treat/extractors/statistics/transition_probability.rb +8 -4
- data/lib/treat/extractors/time/chronic.rb +8 -0
- data/lib/treat/extractors/time/native.rb +6 -0
- data/lib/treat/extractors/time/nickel.rb +31 -23
- data/lib/treat/extractors/topic_words/lda.rb +21 -16
- data/lib/treat/extractors/topics/reuters.rb +6 -4
- data/lib/treat/extractors.rb +7 -7
- data/lib/treat/formatters/readers/abw.rb +32 -0
- data/lib/treat/formatters/readers/autoselect.rb +13 -11
- data/lib/treat/formatters/readers/doc.rb +13 -0
- data/lib/treat/formatters/readers/gocr.rb +2 -0
- data/lib/treat/formatters/readers/html.rb +21 -1
- data/lib/treat/formatters/readers/ocropus.rb +3 -3
- data/lib/treat/formatters/readers/odt.rb +41 -0
- data/lib/treat/formatters/readers/pdf.rb +5 -2
- data/lib/treat/formatters/readers/txt.rb +2 -0
- data/lib/treat/formatters/serializers/xml.rb +3 -2
- data/lib/treat/formatters/serializers/yaml.rb +2 -0
- data/lib/treat/formatters/unserializers/autoselect.rb +7 -1
- data/lib/treat/formatters/unserializers/xml.rb +6 -1
- data/lib/treat/formatters/unserializers/yaml.rb +5 -1
- data/lib/treat/formatters/visualizers/dot.rb +35 -37
- data/lib/treat/formatters/visualizers/html.rb +1 -0
- data/lib/treat/formatters/visualizers/inspect.rb +4 -0
- data/lib/treat/formatters/visualizers/short_value.rb +18 -3
- data/lib/treat/formatters/visualizers/standoff.rb +11 -6
- data/lib/treat/formatters/visualizers/tree.rb +5 -1
- data/lib/treat/formatters/visualizers/txt.rb +6 -1
- data/lib/treat/formatters.rb +1 -1
- data/lib/treat/group.rb +4 -3
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +14 -17
- data/lib/treat/inflectors/conjugations/linguistics.rb +16 -3
- data/lib/treat/inflectors/declensions/linguistics.rb +17 -6
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +9 -10
- data/lib/treat/inflectors/stem/porter.rb +6 -2
- data/lib/treat/inflectors/stem/porter_c.rb +4 -1
- data/lib/treat/inflectors/stem/uea.rb +4 -4
- data/lib/treat/languages/english/tags.rb +16 -0
- data/lib/treat/languages/english.rb +4 -1
- data/lib/treat/lexicalizers/category/from_tag.rb +4 -4
- data/lib/treat/lexicalizers/linkages/naive.rb +3 -3
- data/lib/treat/lexicalizers/tag/brill.rb +3 -11
- data/lib/treat/lexicalizers/tag/lingua.rb +4 -6
- data/lib/treat/lexicalizers.rb +0 -2
- data/lib/treat/processors/chunkers/txt.rb +4 -4
- data/lib/treat/processors/parsers/enju.rb +3 -17
- data/lib/treat/processors/parsers/stanford.rb +4 -0
- data/lib/treat/processors/segmenters/punkt.rb +1 -0
- data/lib/treat/processors/segmenters/stanford.rb +4 -0
- data/lib/treat/processors/segmenters/tactful.rb +4 -1
- data/lib/treat/processors/tokenizers/punkt.rb +1 -2
- data/lib/treat/processors/tokenizers/stanford.rb +4 -0
- data/lib/treat/processors/tokenizers/tactful.rb +1 -1
- data/lib/treat/processors.rb +4 -4
- data/lib/treat/proxies.rb +18 -11
- data/lib/treat/registrable.rb +12 -5
- data/lib/treat/sugar.rb +8 -3
- data/lib/treat/tree.rb +10 -3
- data/lib/treat.rb +55 -55
- data/test/tc_entity.rb +7 -7
- data/test/tc_extractors.rb +6 -4
- data/test/tc_formatters.rb +0 -4
- data/test/tests.rb +2 -0
- data/test/texts.rb +4 -4
- metadata +48 -56
- data/examples/texts/bugged_out.txt +0 -26
- data/examples/texts/half_cocked_basel.txt +0 -16
- data/examples/texts/hedge_funds.txt +0 -24
- data/examples/texts/hose_and_dry.txt +0 -19
- data/examples/texts/hungarys_troubles.txt +0 -46
- data/examples/texts/indias_slowdown.txt +0 -15
- data/examples/texts/merkozy_rides_again.txt +0 -24
- data/examples/texts/prada_is_not_walmart.txt +0 -9
- data/examples/texts/republican_nomination.txt +0 -26
- data/examples/texts/to_infinity_and_beyond.txt +0 -15
- data/lib/treat/entities/text.rb +0 -7
- data/lib/treat/extractors/key_sentences/topics_frequency.rb +0 -49
- data/lib/treat/formatters/cleaners/html.rb +0 -17
data/LICENSE
CHANGED
@@ -18,11 +18,10 @@ Author: Louis-Antoine Mullie (louis.mullie@gmail.com). Copyright 2012.
 Non-trivial amount of code has been incorporated and modified from
 other libraries, specifically for the following files:

-
-
-
-
-
-
-
-- tree.rb - Partyl based on work by
+- processors/tokenizers/macintyre.rb - Utiyama Masao (Ruby License)
+- processors/tokenizers/tactful.rb - Matthew Bunday (GPL license)
+- processors/tokenizers/perl.rb - Todd A. Fisher (MIT License)
+- processors/tokenizers/punkt.rb - Steven Bird Edward Loper and Joel Nothman (Apache 2.0 license)
+- extractors/topics/reuters.rb - Mark Watson (GPL license)
+- inflectors/stemmers/porter.rb - Ray Pereda (No license information)
+- tree.rb - Partly based on work by Anupam Sengupta (Creative Commons Attribution-ShareAlike Unported v. 3.0)
data/TODO
CHANGED
@@ -1,25 +1,26 @@
 ## Urgent

-- Linkers
+- Linkers + documentation
 - Check taggers for context
 - Stanford dependencies parse
-- Enju: test
 - Ocropus => use better function
 - Optimize magic methods... is_token? type methods, phrase categories.
-- Move statistics
+- Move statistics
 - Synset class move
 - general procedure for options, check that user doesn't want to change options...
--
-- Check for # Fix everywhere
-- Check paths; parse bin paths
-- Ferret, Natural Inputs
-- Use consistently delegate
-- Text becomes section
-- Remove top level
+- Ferret, Spider
 - Loading multiple JARs
+- Linguistics loader, stanford loader
 - Tokenized sentences are not parsed
--
--
+- Dot colors
+- Fix encoders
+- Fix Punkt segmenter training text.
+- Mark Watson's text extractor
+- Statistics position in
+- Fix documentation antiword, Graphviz, # encoding: utf-8
+- Shortcut methods.. pre/postprocessors
+- Only Phrase..
+- Frequency in

 ## Eventually

@@ -52,4 +53,6 @@
 - String type detector for other languages
 - Automatic benchmark
 - Raspell spell checker
-- Multithreading
+- Multithreading
+- Mark Watson's Java NLP utility to identify proper nouns (human names and places) in text
+- FastTag a Java fast part of speech tagger.
data/examples/keywords.rb
CHANGED
@@ -57,4 +57,92 @@ c.each_document do |d|
 end
 end

-end
+end
+
+
+
+Treat.edulcorate
+Treat.bin = '/ruby/nat/bin'
+
+c = Collection 'economist'
+c.each_document { |doc| doc.chunk.segment.tokenize }
+
+topic_words = c.topic_words(
+:lda,
+:topics => 5,
+:words_per_topic => 5,
+:iterations => 20
+)
+
+keywords = c.keywords(
+:topics_frequency,
+:topic_words => topic_words,
+:tf_idf_threshold => 180
+)
+
+puts keywords.inspect
+
+abort
+
+c = Phrase 'a test clause'
+c.parse
+puts c.visualize(:tree)
+puts c.visualize(:inspect)
+puts c.visualize(:short_value)
+puts c.visualize(:standoff)
+puts c.visualize(:tree)
+
+c.serialize(:yaml).save('test.yml')
+c.serialize(:xml).save('test.xml')
+
+d = Phrase 'test.yml'
+d.print_tree
+d = Phrase 'test.xml'
+d.print_tree
+
+puts d.words[0].position_in_parent
+abort
+
+w = Word 'running'
+puts w.stem(:porter_c)
+puts w.stem(:porter)
+puts w.stem(:uea)
+
+w = Word 'run'
+
+puts w.infinitive(:linguistics)
+puts w.present_participle(:linguistics)
+puts w.plural(:linguistics)
+
+w = Word 'table'
+
+puts w.synonyms.inspect
+puts w.antonyms.inspect
+puts w.hyponyms.inspect
+puts w.hypernyms.inspect
+
+n = Number 2
+puts n.ordinal_words(:linguistics)
+puts n.cardinal_words(:linguistics)
+
+s = Sentence 'A sentence to parse.'
+s.dup.parse(:enju).print_tree
+s.dup.parse(:stanford).print_tree
+
+s = Sentence 'A sentence to tokenize'
+s.dup.tokenize(:macintyre).print_tree
+s.dup.tokenize(:multilingual).print_tree
+s.dup.tokenize(:perl).print_tree
+s.dup.tokenize(:punkt).print_tree
+s.dup.tokenize(:stanford).print_tree
+s.dup.tokenize(:tactful).print_tree
+
+
+=begin
+c = Collection 'economist'
+# c.each_document { |d| d.chunk.segment.tokenize }
+c.documents[0].chunk.segment
+c.sentences[0].parse(:enju)
+c.each_word { |word| word.stem }
+c.visualize(:dot, features: [:tag]).save('test.dot')
+=end
data/lib/treat/buildable.rb
CHANGED
@@ -24,9 +24,8 @@ module Treat
 "Cannot create a document or collection from " +
 "a string (need a readable file/folder)."
 end
-string = string.to_s
 dot = string.count('.') + string.count('!') + string.count('?')
-return Treat::Entities::
+return Treat::Entities::Section.new(string) if dot > 1 ||
 (string.count("\n") > 0 && dot == 1)
 return Treat::Entities::Sentence.new(string) if dot == 1 && string.size > 5
 if string.count(' ') == 0
@@ -99,12 +98,6 @@ module Treat
 d.read
 end
 def from_serialized_file(file)
-unless [Treat::Entities::Document,
-Treat::Entities::Collection].include?(self)
-raise Treat::Exception,
-"Cannot create something else than a " +
-"document from raw file '#{file}'."
-end
 d = Treat::Entities::Document.new(file)
 d.unserialize
 d.children[0].set_as_root!
data/lib/treat/categories.rb
CHANGED
@@ -1,17 +1,16 @@
 module Treat
 # This module keeps track of all categories that
-# exist and the methods they implement
-# responsible for including the categories.
+# exist and the methods they implement.
 module Categories
-# A list of categories.
 class << self; attr_accessor :list; end
+# Array - list of all categories.
 self.list = []
 # Boolean - does any of the categories have
 # a method that corresponds to sym?
 def self.have_method?(sym); methods.include?(sym); end
 # Cache the list of methods once it has been computed.
 @@methods = []
-#
+# Array - provide a list of all methods implemented
 # by all Treat categories.
 def self.methods
 return @@methods unless @@methods.empty?
data/lib/treat/category.rb
CHANGED
@@ -12,7 +12,7 @@ module Treat
 groups.each do |group|
 group = const_get(group)
 group.targets.each do |entity_type|
-entity = Entities.const_get(cc(entity_type))
+entity = Treat::Entities.const_get(cc(entity_type))
 entity.class_eval { add_delegators group }
 end
 end
data/lib/treat/delegatable.rb
CHANGED
@@ -46,7 +46,7 @@ module Treat
 delegate_klass = group.const_get(:"#{cc(delegate.to_s)}")
 result = entity.accept(group, delegate_klass, m, options)
 if decorator
-result = group.send(decorator,
+result = group.send(decorator, entity, result)
 end
 if group.type == :annotator
 f = decorator.nil? ? m : decorator
data/lib/treat/detectors/encoding/native.rb
CHANGED
@@ -1,7 +1,12 @@
 module Treat
 module Detectors
 module Encoding
+# A wrapper class for Ruby's native encoding detector.
 class Native
+# Return the encoding of the entity according
+# to the Ruby interpreter.
+#
+# Options: none.
 def self.encoding(entity, options={})
 entity.value.encoding.name.
 gsub('-', '_').downcase.intern
data/lib/treat/detectors/encoding/r_chardet19.rb
CHANGED
@@ -6,9 +6,8 @@ module Treat
 # A wrapper for the 'rchardet19' gem, which
 # detects the encoding of a file.
 class RChardet19
-# Returns
-#
-# text value.
+# Returns the encoding of the entity according
+# to the 'rchardet19' gem.
 #
 # Options: none.
 def self.encoding(entity, options={})
data/lib/treat/detectors/language/language_detector.rb
CHANGED
@@ -1,6 +1,10 @@
 module Treat
 module Detectors
 module Language
+# A generic language detector, which is called before
+# any language detector and ensures that configuration
+# options concerning language are enforced (e.g. returns
+# the default language when Treat.detect_language is false).
 class LanguageDetector
 def self.language(entity, options = {})
 if Treat.detect_language == false
data/lib/treat/detectors/language/what_language.rb
CHANGED
@@ -7,7 +7,7 @@ module Treat
 # performs probabilistic language detection.
 class WhatLanguage < LanguageDetector
 # Keep only once instance of the gem class.
-@@
+@@detector = nil
 # Detect the language of an entity using the
 # 'whatlanguage' gem. Return an identifier
 # corresponding to the ISO-639-2 code for the
@@ -15,10 +15,10 @@ module Treat
 def self.language(entity, options = {})
 predetection = super(entity, options)
 return predetection if predetection
-@@
-
+@@detector ||= ::WhatLanguage.new(:possibilities)
+possibilities = @@detector.process_text(entity.to_s)
 lang = {}
-
+possibilities.each do |k,v|
 lang[Treat::Languages.find(k)] = v
 end
 Treat::Feature.new(lang).best
data/lib/treat/detectors.rb
CHANGED
@@ -2,7 +2,7 @@ module Treat
 # Detectors detect a specific meta-information about
 # an entity, such as encoding, format and language.
 #
-# Detectors are language-independent, and thus
+# Detectors are language-independent, and thus there
 # are default algorithms specified for each of them.
 module Detectors
 # Group for algorithms that detect encoding.
data/lib/treat/entities/entity.rb
CHANGED
@@ -43,7 +43,7 @@ module Treat
 # feature does not exist
 def method_missing(sym, *args, &block)
 return self.build(*args) if sym == nil
-if !@features
+if !@features.has_key?(sym)
 r = parse_magic_method(sym, *args, &block)
 if r == :no_magic
 begin
@@ -168,7 +168,10 @@ module Treat
 def <<(entities, clear_parent = true)
 entities = [entities] unless entities.is_a? Array
 entities.each do |entity|
-
+if entity.is_a?(Treat::Entities::Token) ||
+entity.is_a?(Treat::Entities::Constituent)
+register_token(entity) unless entity.value == ''
+end
 end
 super(entities)
 @parent.value = '' if has_parent?
@@ -211,7 +214,6 @@ module Treat
 def short_value(ml = 6); visualize(:short_value, :max_length => ml); end
 # Convenience functions. Convenience decorators.
 def frequency_of(word); statistics(:frequency_of, value: word); end
-
 private
 # Return the first element in the array, warning if not
 # the only one in the array. Used for magic methods: e.g.,
data/lib/treat/entities/tokens.rb
CHANGED
@@ -4,15 +4,24 @@ module Treat
 class Token < Entity
 # All tokens are leafs.
 def is_leaf?; true; end
-
+# Convenience function for statistics.
+def frequency; statistics(:frequency_in); end
+def frequency_in(type); statistics(:frequency_in, type: type); end
+def position_in(type); statistics(:position_in_parent); end
+def tf_idf; statistics(:tf_idf); end
 end
 # Represents a word.
 class Word < Token
-
-def
-
-def
+# Convenience function for conjugations.
+def infinitive(conjugator = nil); conjugations(conjugator, :mode => :infinitive); end
+# Convenience function for conjugations.
+def present_participle(conjugator = nil); conjugations(conjugator, :tense => :present, :mode => :participle); end
+# Convenience function for declensions.
+def plural(declensor = nil); declensions(declensor, :count => :plural); end
+# Convenience function for declensions.
+def singular(declensor = nil); declensions(declensor, :count => :singular); end
 end
+# Represents a clitic ('s).
 class Clitic < Token
 end
 # Represents a number.
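The shortcuts added above are thin wrappers around the existing delegator groups. A minimal sketch of the equivalences they encode, read directly from the one-liners in this hunk; `word` stands for any Treat::Entities::Word attached to a document (hypothetical receiver, not from the diff itself):

# Sketch only: each shortcut forwards to a delegator group with preset options,
# exactly as defined in the hunk above.
word.tf_idf                    # same as word.statistics(:tf_idf)
word.frequency_in(:document)   # same as word.statistics(:frequency_in, type: :document)
word.plural(:linguistics)      # same as word.declensions(:linguistics, :count => :plural)
word.infinitive(:linguistics)  # same as word.conjugations(:linguistics, :mode => :infinitive)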
data/lib/treat/entities/zones.rb
CHANGED
data/lib/treat/entities.rb
CHANGED
@@ -14,7 +14,6 @@ module Treat
 # Then require all possible entities.
 require 'treat/entities/collection'
 require 'treat/entities/document'
-require 'treat/entities/text'
 require 'treat/entities/zones'
 require 'treat/entities/sentence'
 require 'treat/entities/constituents'
@@ -25,9 +24,11 @@ module Treat
 const_get(entity).build(value, id)
 end
 end
+# Cache a list of defined entity types to
+# improve performance.
+@@list = []
 # Provide a list of defined entity types,
 # as non-camel case identifiers.
-@@list = []
 def self.list
 return @@list unless @@list.empty?
 self.constants.each do |constant|
@@ -35,16 +36,17 @@ module Treat
 end
 @@list
 end
-# Return the
-#
+# Return the hierarchy level of the entity
+# class, the minimum being a Token and the
+# maximum being a Collection.
 def self.rank(type)
 klass = Entities.const_get(cc(type))
 compare = lambda { |a,b| a == b || a < b }
 return 0 if compare.call(klass, Token)
 return 1 if compare.call(klass, Constituent)
 return 2 if compare.call(klass, Sentence)
+return 3 if compare.call(klass, Zone)
 return 4 if compare.call(klass, Document)
-return 3 if compare.call(klass, Section)
 return 5 if compare.call(klass, Collection)
 end
 end
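With Zone replacing Section in the rank scale, the hierarchy levels returned by the method above read: Token 0, Constituent 1, Sentence 2, Zone 3, Document 4, Collection 5. A short sketch of how the new keywords extractor (below) leans on this ordering; the :zone call is hypothetical, the :sentence call appears verbatim in the new extractor:

Treat::Entities.rank(:zone)     # => 3 (hypothetical example; this slot was keyed to Section before)
Treat::Entities.rank(:sentence) # => 2, used as the minimum rank for keyword extraction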
data/lib/treat/extractors/keywords/topics_frequency.rb
ADDED
@@ -0,0 +1,40 @@
+module Treat
+module Extractors
+module Keywords
+class TopicsFrequency
+DefaultOptions = {tf_idf_threshold: 180, topic_words: nil}
+def self.keywords(entity, options = {})
+options = DefaultOptions.merge(options)
+unless options[:topic_words]
+raise Treat::Exception, "You must supply topic words."
+end
+if Treat::Entities.rank(entity.type) <
+Treat::Entities.rank(:sentence)
+raise Treat::Exception, 'Cannot get the key ' +
+'sentences of an entity smaller than a sentence.'
+else
+find_keywords(entity, options)
+end
+end
+def self.find_keywords(entity, options)
+keywords = []
+entity.each_word do |word|
+found = false
+options[:topic_words].each do |i, topic_words|
+next if keywords.include?(word.value)
+if topic_words.include?(word.value)
+found = true
+tf_idf = word.tf_idf
+if tf_idf < options[:tf_idf_threshold]
+keywords << word.value
+word.set :is_keyword?, found
+end
+end
+end
+end
+keywords
+end
+end
+end
+end
+end
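The updated data/examples/keywords.rb earlier in this diff is what exercises this new extractor. A condensed sketch of that call sequence, assuming a readable 'economist' collection folder and the LDA dependency are available:

c = Collection 'economist'
c.each_document { |doc| doc.chunk.segment.tokenize }
topic_words = c.topic_words(:lda, :topics => 5, :words_per_topic => 5, :iterations => 20)
keywords = c.keywords(:topics_frequency, :topic_words => topic_words, :tf_idf_threshold => 180)
puts keywords.inspect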
data/lib/treat/extractors/statistics/{frequency.rb → frequency_in.rb}
CHANGED
@@ -1,15 +1,16 @@
 module Treat
 module Extractors
 module Statistics
-class
-
+class FrequencyIn
+DefaultOptions = {type: nil}
 def self.statistics(entity, options={})
+options = DefaultOptions.merge(options)
 if entity.is_leaf?
 w = entity.value.downcase
-if entity.token_registry[:value][w].nil?
+if entity.token_registry(options[:type])[:value][w].nil?
 0
 else
-entity.token_registry[:value][w].size
+entity.token_registry(options[:type])[:value][w].size
 end
 else
 raise Treat::Exception,
data/lib/treat/extractors/statistics/frequency_of.rb
CHANGED
@@ -5,11 +5,9 @@ module Treat
 # Find the frequency of a given string value.
 def self.statistics(entity, options = {})
 w = options[:value]
-
-
-
-entity.token_registry[:value][w].size
-end
+raise Treat::Exception, "Must supply a non-nil value." unless w
+entity.token_registry[:value][w].nil? ? 0 :
+entity.token_registry[:value][w].size
 end
 end
 end
data/lib/treat/extractors/statistics/{position_in.rb → position_in_parent.rb}
CHANGED
@@ -1,11 +1,12 @@
 module Treat
 module Extractors
 module Statistics
-class
+class PositionInParent
 # Find the position of the current entity
 # inside the parent entity with type entity_type.
-
-
+# Not implemented.
+def self.statistics(entity, options = {})
+entity.parent.children.index(entity)
 end
 end
 end
data/lib/treat/extractors/statistics/tf_idf.rb
ADDED
@@ -0,0 +1,36 @@
+module Treat
+module Extractors
+module Statistics
+# "The term count in the given document is simply the
+# number of times a given term appears in that document.
+# This count is usually normalized to prevent a bias
+# towards longer documents (which may have a higher
+# term count regardless of the actual importance of
+# that term in the document) to give a measure of the
+# importance of the term t within the particular document d.
+# Thus we have the term frequency tf(t,d), defined in the
+# simplest case as the occurrence count of a term in a document.
+#
+# The inverse document frequency is a measure of the general
+# importance of the term (obtained by dividing the total number
+# of documents by the number of documents containing the term,
+# and then taking the logarithm of that quotient)."
+#
+# (From Wikipedia)
+class TfIdf
+DefaultOptions = { type: nil }
+def self.statistics(entity, options={})
+tf = entity.frequency_in(:document)
+tf = tf / entity.root.word_count
+d = entity.root.document_count
+i = 0
+entity.root.each_document do |document|
+i += 1 if document.frequency_of(entity.value)
+end
+idf = ::Math.log(d.to_f/(i.to_f + 1)).abs
+tf.to_f/idf.to_f
+end
+end
+end
+end
+end
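Spelled out, the statistic the new class returns is the following (this reflects the code above as written, which divides the normalized term frequency by the absolute log term rather than multiplying by it):

\mathrm{tf}(t,d) = \frac{f_{t,d}}{W_{\mathrm{root}}}, \qquad
\mathrm{idf}(t) = \left|\ln\frac{D}{n+1}\right|, \qquad
\mathrm{score}(t,d) = \frac{\mathrm{tf}(t,d)}{\mathrm{idf}(t)}

where f_{t,d} is the word's count within its document, W_root is the root entity's word count, D is the number of documents under the root, and n is the number of documents counted by the frequency_of loop.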
data/lib/treat/extractors/statistics/transition_matrix.rb
CHANGED
@@ -1,23 +1,23 @@
 module Treat
 module Extractors
 module Statistics
+# Experimental algorithm to generate transition matrices.
 class TransitionMatrix
-
+DefaultOptions = {
+normalize: true,
+features: [:tag],
+condition: lambda { |e| true },
+entity_types: [:word],
+relationships: [:parent, :right, :children]
+}
 # Find the transition matrix.
 def self.statistics(entity, options={})
-
-normalize = options[:normalize] || true
-features = options[:features] || [:tag]
-condition = options[:condition] || lambda { |e| true }
-entity_types = options[:entity_types] ? options[:entity_types] :
-[options[:entity_type]]
-relationships = options[:relationships] ||
-[:parent, :left, :right, :children]
+options = DefaultOptions.merge(options)

 # Create lambdas to generate the arrays.
-empty_prototype = {}; features.each { |f| empty_prototype[f] = {} }
+empty_prototype = {}; options[:features].each { |f| empty_prototype[f] = {} }
 empty = lambda { Marshal.load(Marshal.dump(empty_prototype)) }
-empty2_prototype = {}; relationships.each { |r| empty2_prototype[r] = empty.call }
+empty2_prototype = {}; options[:relationships].each { |r| empty2_prototype[r] = empty.call }
 empty2 = lambda { Marshal.load(Marshal.dump(empty2_prototype)) }

 # Deep (recursive) merger.
@@ -27,24 +27,25 @@ module Treat

 # Master matrix.
 mm = nil
+tm = empty.call

-entity.each_entity(*entity_types) do |target|
-
-next unless condition.call(target)
+entity.each_entity(*options[:entity_types]) do |target|
+
+next unless options[:condition].call(target)

 # Initialize the empty transition matrix.
-
+

 # Calculate the transition probabilities.
-features.each do |f1|
+options[:features].each do |f1|

 v1 = target.send(f1)
 tm[f1][v1] = empty2.call

-relationships.each do |relationship|
+options[:relationships].each do |relationship|
 tm[f1][v1][relationship] = empty.call
-
-features.each do |f2|
+
+options[:features].each do |f2|
 relatives = target.send(relationship)
 relatives = [relatives] unless relatives.is_a? Array
 relatives.each do |relative|
@@ -55,9 +56,9 @@ module Treat
 tm[f1][v1][relationship][f2][v2] += 1.0
 end
 end
-
+
 tm[f1][v1][:edge] = empty.call
-
+
 target.edges.each do |id, edge_type|
 s = target.ancestor_with_type :sentence
 if s
@@ -68,14 +69,13 @@ module Treat
 tm[f1][v1][:edge][f2][v2] += 1.0
 end
 end
-
+
 end
 end
 end
-
-mm = mm ? mm.merge(tm, &merger) : tm
 end
-
+mm = mm ? mm.merge(tm, &merger) : tm
+if options[:normalize]
 normalize(mm)
 else
 mm
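A hypothetical invocation of the refactored extractor, following the same statistics(...) call pattern used by the other extractors in this diff; the option keys come from the new DefaultOptions hash, and any omitted key now falls back to those defaults:

# Sketch only; 'sentence' stands for any parsed Treat entity, and the call
# pattern is assumed from the other statistics shortcuts shown in this diff.
matrix = sentence.statistics(:transition_matrix,
  :normalize => true,
  :features => [:tag],
  :entity_types => [:word],
  :relationships => [:parent, :right, :children]
)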