treat 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +7 -8
- data/TODO +16 -13
- data/examples/keywords.rb +89 -1
- data/lib/treat/buildable.rb +1 -8
- data/lib/treat/categories.rb +3 -4
- data/lib/treat/category.rb +1 -1
- data/lib/treat/delegatable.rb +1 -1
- data/lib/treat/detectors/encoding/native.rb +5 -0
- data/lib/treat/detectors/encoding/r_chardet19.rb +2 -3
- data/lib/treat/detectors/language/language_detector.rb +4 -0
- data/lib/treat/detectors/language/what_language.rb +4 -4
- data/lib/treat/detectors.rb +1 -1
- data/lib/treat/entities/entity.rb +5 -3
- data/lib/treat/entities/tokens.rb +14 -5
- data/lib/treat/entities/zones.rb +4 -0
- data/lib/treat/entities.rb +7 -5
- data/lib/treat/extractors/keywords/topics_frequency.rb +40 -0
- data/lib/treat/extractors/statistics/{frequency.rb → frequency_in.rb} +5 -4
- data/lib/treat/extractors/statistics/frequency_of.rb +3 -5
- data/lib/treat/extractors/statistics/{position_in.rb → position_in_parent.rb} +4 -3
- data/lib/treat/extractors/statistics/tf_idf.rb +36 -0
- data/lib/treat/extractors/statistics/transition_matrix.rb +25 -25
- data/lib/treat/extractors/statistics/transition_probability.rb +8 -4
- data/lib/treat/extractors/time/chronic.rb +8 -0
- data/lib/treat/extractors/time/native.rb +6 -0
- data/lib/treat/extractors/time/nickel.rb +31 -23
- data/lib/treat/extractors/topic_words/lda.rb +21 -16
- data/lib/treat/extractors/topics/reuters.rb +6 -4
- data/lib/treat/extractors.rb +7 -7
- data/lib/treat/formatters/readers/abw.rb +32 -0
- data/lib/treat/formatters/readers/autoselect.rb +13 -11
- data/lib/treat/formatters/readers/doc.rb +13 -0
- data/lib/treat/formatters/readers/gocr.rb +2 -0
- data/lib/treat/formatters/readers/html.rb +21 -1
- data/lib/treat/formatters/readers/ocropus.rb +3 -3
- data/lib/treat/formatters/readers/odt.rb +41 -0
- data/lib/treat/formatters/readers/pdf.rb +5 -2
- data/lib/treat/formatters/readers/txt.rb +2 -0
- data/lib/treat/formatters/serializers/xml.rb +3 -2
- data/lib/treat/formatters/serializers/yaml.rb +2 -0
- data/lib/treat/formatters/unserializers/autoselect.rb +7 -1
- data/lib/treat/formatters/unserializers/xml.rb +6 -1
- data/lib/treat/formatters/unserializers/yaml.rb +5 -1
- data/lib/treat/formatters/visualizers/dot.rb +35 -37
- data/lib/treat/formatters/visualizers/html.rb +1 -0
- data/lib/treat/formatters/visualizers/inspect.rb +4 -0
- data/lib/treat/formatters/visualizers/short_value.rb +18 -3
- data/lib/treat/formatters/visualizers/standoff.rb +11 -6
- data/lib/treat/formatters/visualizers/tree.rb +5 -1
- data/lib/treat/formatters/visualizers/txt.rb +6 -1
- data/lib/treat/formatters.rb +1 -1
- data/lib/treat/group.rb +4 -3
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +14 -17
- data/lib/treat/inflectors/conjugations/linguistics.rb +16 -3
- data/lib/treat/inflectors/declensions/linguistics.rb +17 -6
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +9 -10
- data/lib/treat/inflectors/stem/porter.rb +6 -2
- data/lib/treat/inflectors/stem/porter_c.rb +4 -1
- data/lib/treat/inflectors/stem/uea.rb +4 -4
- data/lib/treat/languages/english/tags.rb +16 -0
- data/lib/treat/languages/english.rb +4 -1
- data/lib/treat/lexicalizers/category/from_tag.rb +4 -4
- data/lib/treat/lexicalizers/linkages/naive.rb +3 -3
- data/lib/treat/lexicalizers/tag/brill.rb +3 -11
- data/lib/treat/lexicalizers/tag/lingua.rb +4 -6
- data/lib/treat/lexicalizers.rb +0 -2
- data/lib/treat/processors/chunkers/txt.rb +4 -4
- data/lib/treat/processors/parsers/enju.rb +3 -17
- data/lib/treat/processors/parsers/stanford.rb +4 -0
- data/lib/treat/processors/segmenters/punkt.rb +1 -0
- data/lib/treat/processors/segmenters/stanford.rb +4 -0
- data/lib/treat/processors/segmenters/tactful.rb +4 -1
- data/lib/treat/processors/tokenizers/punkt.rb +1 -2
- data/lib/treat/processors/tokenizers/stanford.rb +4 -0
- data/lib/treat/processors/tokenizers/tactful.rb +1 -1
- data/lib/treat/processors.rb +4 -4
- data/lib/treat/proxies.rb +18 -11
- data/lib/treat/registrable.rb +12 -5
- data/lib/treat/sugar.rb +8 -3
- data/lib/treat/tree.rb +10 -3
- data/lib/treat.rb +55 -55
- data/test/tc_entity.rb +7 -7
- data/test/tc_extractors.rb +6 -4
- data/test/tc_formatters.rb +0 -4
- data/test/tests.rb +2 -0
- data/test/texts.rb +4 -4
- metadata +48 -56
- data/examples/texts/bugged_out.txt +0 -26
- data/examples/texts/half_cocked_basel.txt +0 -16
- data/examples/texts/hedge_funds.txt +0 -24
- data/examples/texts/hose_and_dry.txt +0 -19
- data/examples/texts/hungarys_troubles.txt +0 -46
- data/examples/texts/indias_slowdown.txt +0 -15
- data/examples/texts/merkozy_rides_again.txt +0 -24
- data/examples/texts/prada_is_not_walmart.txt +0 -9
- data/examples/texts/republican_nomination.txt +0 -26
- data/examples/texts/to_infinity_and_beyond.txt +0 -15
- data/lib/treat/entities/text.rb +0 -7
- data/lib/treat/extractors/key_sentences/topics_frequency.rb +0 -49
- data/lib/treat/formatters/cleaners/html.rb +0 -17
data/lib/treat/lexicalizers.rb
CHANGED
@@ -29,7 +29,6 @@ module Treat
|
|
29
29
|
extend Group
|
30
30
|
self.type = :annotator
|
31
31
|
self.targets = [:word, :number]
|
32
|
-
|
33
32
|
def self.synonyms(entity, synsets)
|
34
33
|
synsets.collect { |ss| ss.synonyms }.flatten - [entity.value]
|
35
34
|
end
|
@@ -42,7 +41,6 @@ module Treat
|
|
42
41
|
def self.hypernyms(entity, synsets)
|
43
42
|
synsets.collect { |ss| ss.hypernyms }.flatten
|
44
43
|
end
|
45
|
-
|
46
44
|
end
|
47
45
|
extend Treat::Category
|
48
46
|
end
|
@@ -5,18 +5,18 @@ module Treat
|
|
5
5
|
# zones based on a very naive analysis of the
|
6
6
|
# file.
|
7
7
|
class Txt
|
8
|
-
#
|
8
|
+
# Split a document into Zone objects.
|
9
9
|
def self.chunk(text, options = {})
|
10
10
|
zones = text.to_s.split("\n")
|
11
11
|
zones.each do |zone|
|
12
12
|
next if zone.strip == ''
|
13
13
|
if false # fix
|
14
|
-
text << Entities::List.new(zone)
|
14
|
+
text << Treat::Entities::List.new(zone)
|
15
15
|
end
|
16
16
|
if zone.length < 60
|
17
|
-
text << Entities::Title.new(zone)
|
17
|
+
text << Treat::Entities::Title.new(zone)
|
18
18
|
else
|
19
|
-
text << Entities::Paragraph.new(zone)
|
19
|
+
text << Treat::Entities::Paragraph.new(zone)
|
20
20
|
end
|
21
21
|
end
|
22
22
|
text
|
@@ -20,21 +20,6 @@ module Treat
|
|
20
20
|
@@i = 0
|
21
21
|
# Require the Nokogiri XML parser.
|
22
22
|
require 'nokogiri'
|
23
|
-
# Maps Enju categories to Treat categories.
|
24
|
-
CategoryMap = {
|
25
|
-
'ADJ' => :adjective,
|
26
|
-
'ADV' => :adverb,
|
27
|
-
'CONJ' => :conjunction,
|
28
|
-
'COOD' => :conjunction,
|
29
|
-
'C' => :complementizer,
|
30
|
-
'D' => :determiner,
|
31
|
-
'N' => :noun,
|
32
|
-
'P' => :preposition,
|
33
|
-
'PN' => :punctuation,
|
34
|
-
'SC' => :conjunction,
|
35
|
-
'V' => :verb,
|
36
|
-
'PRT' => :particle
|
37
|
-
}
|
38
23
|
# Return the process running Enju.
|
39
24
|
def self.proc
|
40
25
|
if @@parsers.size < @@options[:processes]
|
@@ -55,7 +40,8 @@ module Treat
|
|
55
40
|
text = entity.to_s + '.'
|
56
41
|
else
|
57
42
|
remove_last = false
|
58
|
-
text = entity.to_s.gsub('.', '')
|
43
|
+
text = entity.to_s.gsub('.', '')
|
44
|
+
text += '.' unless ['!', '?'].include?(text[-1])
|
59
45
|
end
|
60
46
|
stdin.puts(text + "\n")
|
61
47
|
parsed = build(stdout.gets, remove_last)
|
@@ -114,7 +100,7 @@ module Treat
|
|
114
100
|
new_attributes[:saturated] = (value[-1] == 'P')
|
115
101
|
value = value[0..-2]
|
116
102
|
end
|
117
|
-
cat =
|
103
|
+
cat = Treat::Languages::English::EnjuCatToCategory[value]
|
118
104
|
new_attributes[:cat] = cat
|
119
105
|
else
|
120
106
|
new_attributes[:enju_cat] = value
|
@@ -1,6 +1,7 @@
|
|
1
1
|
module Treat
|
2
2
|
module Processors
|
3
3
|
module Parsers
|
4
|
+
# A wrapper class for the Stanford parser.
|
4
5
|
class Stanford
|
5
6
|
# Require the Ruby-Java bridge.
|
6
7
|
silence_warnings { require 'rjb' }
|
@@ -13,6 +14,7 @@ module Treat
|
|
13
14
|
Rjb::load(jars[0], ['-Xms256M', '-Xmx512M'])
|
14
15
|
LexicalizedParser = ::Rjb::import('edu.stanford.nlp.parser.lexparser.LexicalizedParser')
|
15
16
|
@@parsers = {}
|
17
|
+
# Parse the entity using the Stanford parser.
|
16
18
|
def self.parse(entity, options = {})
|
17
19
|
lang = Treat::Languages.describe(entity.language).to_s.upcase
|
18
20
|
pcfg = "#{Treat.bin}/stanford-parser*/grammar/#{lang.upcase}PCFG.ser.gz"
|
@@ -26,6 +28,8 @@ module Treat
|
|
26
28
|
recurse(parse, entity)
|
27
29
|
entity
|
28
30
|
end
|
31
|
+
# Helper method which recurses the tree supplied by
|
32
|
+
# the Stanford parser.
|
29
33
|
def self.recurse(java_node, ruby_node)
|
30
34
|
# Leaf
|
31
35
|
if java_node.num_children == 0
|
@@ -1,6 +1,8 @@
|
|
1
1
|
module Treat
|
2
2
|
module Processors
|
3
3
|
module Segmenters
|
4
|
+
# A wrapper for the sentence splitter supplied by
|
5
|
+
# the Stanford parser.
|
4
6
|
class Stanford
|
5
7
|
# Require the Ruby-Java bridge.
|
6
8
|
silence_warnings do
|
@@ -16,6 +18,8 @@ module Treat
|
|
16
18
|
::Rjb::import('edu.stanford.nlp.process.DocumentPreprocessor')
|
17
19
|
StringReader = ::Rjb::import('java.io.StringReader')
|
18
20
|
end
|
21
|
+
# Segment sentences using the sentence splitter supplied by
|
22
|
+
# the Stanford parser.
|
19
23
|
def self.segment(entity, options = {})
|
20
24
|
sr = StringReader.new(entity.to_s)
|
21
25
|
sit = DocumentPreprocessor.new(sr).iterator
|
@@ -8,7 +8,10 @@ module Treat
|
|
8
8
|
# based on Splitta, but has support for ‘?’ and ‘!’
|
9
9
|
# as well as primitive handling of XHTML markup.
|
10
10
|
#
|
11
|
-
# Project website:
|
11
|
+
# Project website: https://github.com/SlyShy/Tackful-Tokenizer
|
12
|
+
# Original paper: Dan Gillick. 2009. Sentence Boundary Detection
|
13
|
+
# and the Problem with the U.S. University of California, Berkeley.
|
14
|
+
# http://dgillick.com/resource/sbd_naacl_2009.pdf
|
12
15
|
class Tactful
|
13
16
|
# Require the 'tactful_tokenizer' gem.
|
14
17
|
silence_warnings { require 'tactful_tokenizer' }
|
@@ -26,12 +26,11 @@ module Treat
|
|
26
26
|
ReWordTokenizer = /#{ReMultiCharPunct}|(?=#{ReWordStart})\S+?(?=\s|$|#{ReNonWordChars}|#{ReMultiCharPunct}|,(?=$|\s|#{ReNonWordChars}|#{ReMultiCharPunct}))|\S/
|
27
27
|
RePeriodContext = /\S*#{ReSentEndChars}(?=(?<after_tok>#{ReNonWordChars}|\s+(?<next_tok>\S+)))/
|
28
28
|
# Tokenize the text using the algorithm lifted from
|
29
|
-
# the Punkt tokenizer.
|
29
|
+
# the Punkt tokenizer gem.
|
30
30
|
#
|
31
31
|
# Options: none.
|
32
32
|
def self.tokenize(entity, options = {})
|
33
33
|
entity.to_s.scan(ReWordTokenizer).each do |token|
|
34
|
-
puts token
|
35
34
|
entity << Treat::Entities::Entity.from_string(token)
|
36
35
|
end
|
37
36
|
entity
|
@@ -1,6 +1,8 @@
|
|
1
1
|
module Treat
|
2
2
|
module Processors
|
3
3
|
module Tokenizers
|
4
|
+
# A wrapper for the Stanford parser's Penn-Treebank
|
5
|
+
# style tokenizer.
|
4
6
|
class Stanford
|
5
7
|
# Require the Ruby-Java bridge.
|
6
8
|
silence_warnings do
|
@@ -18,6 +20,8 @@ module Treat
|
|
18
20
|
CoreLabelTokenFactory = ::Rjb::import('edu.stanford.nlp.process.CoreLabelTokenFactory')
|
19
21
|
StringReader = ::Rjb::import('java.io.StringReader')
|
20
22
|
end
|
23
|
+
# Tokenize the entity using a Penn-Treebank style tokenizer
|
24
|
+
# included with the Stanford Parser.
|
21
25
|
def self.tokenize(entity, options = {})
|
22
26
|
ptbt = PTBTokenizer.new(
|
23
27
|
StringReader.new(entity.to_s),
|
@@ -41,7 +41,7 @@ module Treat
|
|
41
41
|
[/([Cc])annot/, '\1an not']
|
42
42
|
]
|
43
43
|
# Tokenize the entity using a rule-based algorithm
|
44
|
-
#
|
44
|
+
# that has been lifted from the 'tactful-tokenizer'
|
45
45
|
# gem.
|
46
46
|
def self.tokenize(entity, options = {})
|
47
47
|
s = entity.to_s
|
data/lib/treat/processors.rb
CHANGED
@@ -17,19 +17,19 @@ module Treat
|
|
17
17
|
module Chunkers
|
18
18
|
extend Group
|
19
19
|
self.type = :transformer
|
20
|
-
self.targets = [:document, :
|
20
|
+
self.targets = [:document, :zone]
|
21
21
|
end
|
22
22
|
# Segmenters split a text or zone into sentences.
|
23
23
|
module Segmenters
|
24
24
|
extend Group
|
25
25
|
self.type = :transformer
|
26
|
-
self.targets = [:document, :
|
26
|
+
self.targets = [:document, :zone]
|
27
27
|
end
|
28
28
|
# Tokenizers split a sentence into Token objects.
|
29
29
|
module Tokenizers
|
30
30
|
extend Group
|
31
31
|
self.type = :transformer
|
32
|
-
self.targets = [:document, :
|
32
|
+
self.targets = [:document, :zone, :sentence, :constituent]
|
33
33
|
end
|
34
34
|
# Parsers split a sentence into constituent objects
|
35
35
|
# representing its syntactic structure, with the
|
@@ -37,7 +37,7 @@ module Treat
|
|
37
37
|
module Parsers
|
38
38
|
extend Group
|
39
39
|
self.type = :transformer
|
40
|
-
self.targets = [:document, :
|
40
|
+
self.targets = [:document, :zone, :sentence, :constituent]
|
41
41
|
end
|
42
42
|
# Makes all the groups autoloadable and creates the delegators.
|
43
43
|
extend Treat::Category
|
data/lib/treat/proxies.rb
CHANGED
@@ -1,11 +1,13 @@
|
|
1
1
|
module Treat
|
2
|
-
# Proxies install Treat functions on
|
2
|
+
# Proxies install Treat functions on core Ruby classes.
|
3
3
|
module Proxies
|
4
4
|
# The module proxy provides functionality common
|
5
5
|
# to the different types of proxies.
|
6
6
|
module Proxy
|
7
|
+
# Build the entity corresponding to the proxied
|
8
|
+
# object and send the method call to the entity.
|
7
9
|
def method_missing(sym, *args, &block)
|
8
|
-
if Categories.have_method?(sym)
|
10
|
+
if Treat::Categories.have_method?(sym)
|
9
11
|
to_entity.send(sym, *args)
|
10
12
|
else
|
11
13
|
super(sym, *args, &block)
|
@@ -16,8 +18,8 @@ module Treat
|
|
16
18
|
end
|
17
19
|
end
|
18
20
|
# Install Treat functions on String objects.
|
19
|
-
module
|
20
|
-
include Proxy
|
21
|
+
module String
|
22
|
+
include Treat::Proxies::Proxy
|
21
23
|
# Save the string to the specified file.
|
22
24
|
def save(file)
|
23
25
|
File.open(file, 'w') { |f| f.write(self) }
|
@@ -28,16 +30,21 @@ module Treat
|
|
28
30
|
end
|
29
31
|
end
|
30
32
|
# Install Treat functions on Numeric objects.
|
31
|
-
module
|
32
|
-
include Proxy
|
33
|
+
module Numeric
|
34
|
+
include Treat::Proxies::Proxy
|
33
35
|
# Return the entity corresponding to the number.
|
34
36
|
def to_entity(builder = nil)
|
35
37
|
Treat::Entities::Entity.from_numeric(self)
|
36
38
|
end
|
37
39
|
end
|
38
40
|
# Install Treat functions on Array objects.
|
39
|
-
module
|
40
|
-
include Proxy
|
41
|
+
module Array
|
42
|
+
include Treat::Proxies::Proxy
|
43
|
+
# The behaviour of this proxy is special:
|
44
|
+
# if a Treat function is called on an array,
|
45
|
+
# the function will be called on each element
|
46
|
+
# of the array and a new array with the
|
47
|
+
# results will be returned.
|
41
48
|
def method_missing(sym, *args, &block)
|
42
49
|
if Category.has_method?(sym)
|
43
50
|
array = []
|
@@ -59,8 +66,8 @@ module Treat
|
|
59
66
|
end
|
60
67
|
end
|
61
68
|
# Include the proxies in the core classes.
|
62
|
-
String.class_eval { include
|
63
|
-
Numeric.class_eval { include
|
64
|
-
Array.class_eval { include
|
69
|
+
::String.class_eval { include Treat::Proxies::String }
|
70
|
+
::Numeric.class_eval { include Treat::Proxies::Numeric }
|
71
|
+
::Array.class_eval { include Treat::Proxies::Array }
|
65
72
|
end
|
66
73
|
end
|
data/lib/treat/registrable.rb
CHANGED
@@ -3,20 +3,27 @@ module Treat
|
|
3
3
|
# Registers a token in the @token_registry
|
4
4
|
# hash in the root node.
|
5
5
|
def register_token(token)
|
6
|
-
if is_root?
|
6
|
+
if is_root? || type == :document
|
7
7
|
@token_registry ||= {value: {}, id: {}}
|
8
8
|
@token_registry[:id][token.id] = token
|
9
|
-
@token_registry[:value][token.
|
10
|
-
@token_registry[:value][token.
|
9
|
+
@token_registry[:value][token.to_s] ||= []
|
10
|
+
@token_registry[:value][token.to_s] << token
|
11
|
+
if has_parent? && type == :document
|
12
|
+
@parent.register_token(token)
|
13
|
+
end
|
11
14
|
else
|
12
15
|
@parent.register_token(token)
|
13
16
|
end
|
14
17
|
end
|
15
18
|
# Find the token registry, which is
|
16
19
|
# always in the root node.
|
17
|
-
def token_registry
|
20
|
+
def token_registry(type = nil)
|
21
|
+
if self.type == type
|
22
|
+
@token_registry ||= {value: {}, id: {}}
|
23
|
+
return @token_registry
|
24
|
+
end
|
18
25
|
if has_parent?
|
19
|
-
@parent.token_registry
|
26
|
+
@parent.token_registry(type)
|
20
27
|
else
|
21
28
|
@token_registry ||= {value: {}, id: {}}
|
22
29
|
@token_registry
|
data/lib/treat/sugar.rb
CHANGED
@@ -1,5 +1,10 @@
|
|
1
1
|
module Treat
|
2
|
+
# This module provides syntactic sugar in the following manner:
|
3
|
+
# all entities found under Treat::Entities will be made
|
4
|
+
# available within the global namespace. For example,
|
5
|
+
# Treat::Entities::Word can now be referred to as simply 'Word'.
|
2
6
|
module Sugar
|
7
|
+
# Installs syntactic sugar.
|
3
8
|
def edulcorate
|
4
9
|
return if @@edulcorated
|
5
10
|
@@edulcorated = true
|
@@ -13,6 +18,7 @@ module Treat
|
|
13
18
|
end
|
14
19
|
end
|
15
20
|
end
|
21
|
+
# Uninstalls syntactic sugar.
|
16
22
|
def unedulcorate
|
17
23
|
return unless @@edulcorated
|
18
24
|
@@edulcorated = false
|
@@ -24,14 +30,13 @@ module Treat
|
|
24
30
|
end
|
25
31
|
end
|
26
32
|
end
|
27
|
-
#
|
33
|
+
# Boolean - whether syntactic sugar is
|
28
34
|
# enabled or not.
|
29
35
|
def edulcorated?; @@edulcorated; end
|
30
36
|
# Syntactic sugar is disabled by default.
|
31
37
|
@@edulcorated = false
|
32
|
-
|
33
38
|
private
|
34
|
-
|
39
|
+
# Helper method, yields each entity type and class.
|
35
40
|
def each_entity_class
|
36
41
|
Treat::Entities.list.each do |entity_type|
|
37
42
|
type = :"#{cc(entity_type)}"
|
data/lib/treat/tree.rb
CHANGED
@@ -68,18 +68,20 @@ module Treat
|
|
68
68
|
end
|
69
69
|
nodes[0]
|
70
70
|
end
|
71
|
+
# Retrieve a child node by name or index.
|
71
72
|
def [](name_or_index)
|
72
73
|
if name_or_index == nil
|
73
74
|
raise Treat::Exception,
|
74
75
|
"Non-nil name or index needs to be provided."
|
75
76
|
end
|
76
77
|
if name_or_index.kind_of?(Integer) &&
|
77
|
-
name_or_index < 1000
|
78
|
+
name_or_index < 1000
|
78
79
|
@children[name_or_index]
|
79
80
|
else
|
80
81
|
@children_hash[name_or_index]
|
81
82
|
end
|
82
83
|
end
|
84
|
+
# Remove the supplied node or id of a node from the children.
|
83
85
|
def remove!(ion)
|
84
86
|
return nil unless ion
|
85
87
|
if ion.is_a? Treat::Tree::Node
|
@@ -91,6 +93,7 @@ module Treat
|
|
91
93
|
@children_hash.delete(ion)
|
92
94
|
end
|
93
95
|
end
|
96
|
+
# Remove all children.
|
94
97
|
def remove_all!
|
95
98
|
@children.each { |child| child.set_as_root! }
|
96
99
|
@children.clear
|
@@ -103,14 +106,18 @@ module Treat
|
|
103
106
|
id = @parent.children.index(self)
|
104
107
|
@parent.children.at(id + 1) if id
|
105
108
|
end
|
109
|
+
# Return the sibling N positions to the left of this one.
|
106
110
|
def left(n = 1); sibling(-1*n); end
|
111
|
+
# Return the sibling N positions to the right of this one.
|
107
112
|
def right(n = 1); sibling(1*n); end
|
113
|
+
# Return the sibling with position #pos versus
|
114
|
+
# this one. #pos can be ... -1, 0, 1, ...
|
108
115
|
def sibling(pos)
|
109
116
|
return nil if is_root?
|
110
117
|
id = @parent.children.index(self)
|
111
118
|
@parent.children.at(id + pos)
|
112
119
|
end
|
113
|
-
#
|
120
|
+
# Return all brothers and sisters of this node.
|
114
121
|
def siblings
|
115
122
|
r = @parent.children.dup
|
116
123
|
r.delete(self)
|
@@ -133,7 +140,7 @@ module Treat
|
|
133
140
|
# Does the entity have a feature ?
|
134
141
|
def has_feature?(feature)
|
135
142
|
@features.has_key?(feature) ||
|
136
|
-
|
143
|
+
[:id, :value, :children, :edges].include?(feature)
|
137
144
|
end
|
138
145
|
alias :has? :has_feature?
|
139
146
|
# Link this node to the target node with
|
data/lib/treat.rb
CHANGED
@@ -1,51 +1,50 @@
|
|
1
|
-
#
|
2
1
|
# Main namespace for Treat modules.
|
3
2
|
#
|
4
|
-
#
|
3
|
+
# === Entities
|
5
4
|
#
|
6
|
-
#
|
7
|
-
#
|
8
|
-
#
|
9
|
-
#
|
10
|
-
#
|
5
|
+
# Entities are Tree structures that represent any textual
|
6
|
+
# entity (from a collection of texts down to an individual
|
7
|
+
# word) with a value, features, children and edges linking
|
8
|
+
# it to other textual entities. Sugar provides syntactic sugar
|
9
|
+
# for Entities and can be enabled by running Treat.edulcorate.
|
11
10
|
#
|
12
|
-
#
|
11
|
+
# Here are some examples of how to create entities:
|
13
12
|
#
|
14
|
-
#
|
15
|
-
#
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
13
|
+
# c = Collection 'folder_with_documents'
|
14
|
+
# d = Document 'filename.txt' # (or PDF, html, xml, png, jpg, gif).
|
15
|
+
# p = Paragraph 'A short story. The end.'
|
16
|
+
# s = Sentence 'That is not a sentence.'
|
17
|
+
# w = Word 'fox'
|
19
18
|
#
|
20
|
-
#
|
21
|
-
#
|
22
|
-
#
|
23
|
-
#
|
19
|
+
# Here's a full list of entities (subtypes in parentheses):
|
20
|
+
# Collection, Document, Zone (Section, Title, Paragraph or List),
|
21
|
+
# Sentence, Constituent (Phrase or Clause), Token (Word, Number,
|
22
|
+
# Symbol or Punctuation).
|
23
|
+
#
|
24
|
+
# === Proxies
|
24
25
|
#
|
25
|
-
#
|
26
|
+
# Proxies allow the Treat functions to be called on the core
|
27
|
+
# Ruby classes String, Numeric and Array. They build the entity
|
28
|
+
# corresponding to the supplied raw text and send the requested
|
29
|
+
# function to it.
|
26
30
|
#
|
27
|
-
#
|
28
|
-
# Ruby classes String, Numeric and Array. They build the entity
|
29
|
-
# corresponding to the supplied raw text and send the requested
|
30
|
-
# function to it.
|
31
|
-
#
|
32
|
-
# For example,
|
31
|
+
# For example,
|
33
32
|
#
|
34
|
-
#
|
33
|
+
# 'fox'.tag
|
35
34
|
#
|
36
|
-
#
|
35
|
+
# Is equivalent to:
|
37
36
|
#
|
38
|
-
#
|
39
|
-
#
|
37
|
+
# w = Word 'fox'
|
38
|
+
# w.tag
|
40
39
|
#
|
41
|
-
#
|
40
|
+
# === Functions
|
42
41
|
#
|
43
|
-
#
|
44
|
-
#
|
45
|
-
#
|
46
|
-
#
|
42
|
+
# A class is defined for each implemented algorithm performing a given
|
43
|
+
# task. These classes are clustered into groups of algorithms performing
|
44
|
+
# the same given task (Group), and the groups are clustered into Categories
|
45
|
+
# of groups performing related tasks.
|
47
46
|
#
|
48
|
-
#
|
47
|
+
# Here are the different Categories:
|
49
48
|
#
|
50
49
|
# - Detectors - Category for language, encoding, and format
|
51
50
|
# detectors.
|
@@ -60,22 +59,22 @@
|
|
60
59
|
# - Processors - Namespace for algorithms that process collections and
|
61
60
|
# documents into trees.
|
62
61
|
#
|
63
|
-
#
|
62
|
+
# === Linguistic resources
|
64
63
|
#
|
65
|
-
#
|
66
|
-
#
|
67
|
-
#
|
64
|
+
# The Languages module contains linguistic information about
|
65
|
+
# languages (full ISO-639-1 and 2 language list, tag alignments
|
66
|
+
# for three treebanks, word categories, etc.)
|
68
67
|
#
|
69
|
-
#
|
68
|
+
# === Mixins for entities.
|
70
69
|
#
|
71
|
-
#
|
72
|
-
#
|
73
|
-
#
|
74
|
-
#
|
70
|
+
# Buildable, Delegatable, Visitable and Registrable are included in
|
71
|
+
# or extended by Entity and provide it with the ability to be built,
|
72
|
+
# to delegate function calls, to accept visitors and to maintain a
|
73
|
+
# token registry, respectively.
|
75
74
|
#
|
76
|
-
#
|
75
|
+
# === Exception class.
|
77
76
|
#
|
78
|
-
#
|
77
|
+
# Exception defines a custom exception class for the Treat module.
|
79
78
|
#
|
80
79
|
module Treat
|
81
80
|
|
@@ -85,20 +84,20 @@ module Treat
|
|
85
84
|
end
|
86
85
|
|
87
86
|
# The current version of Treat.
|
88
|
-
VERSION = "0.1.
|
87
|
+
VERSION = "0.1.3"
|
89
88
|
|
90
|
-
#
|
89
|
+
# $LOAD_PATH << '/ruby/treat/lib/' # Remove for release
|
91
90
|
|
92
91
|
# Create class variables for the Treat module.
|
93
92
|
class << self
|
94
|
-
#
|
93
|
+
# Symbol - default language to use when detect_language is false.
|
95
94
|
attr_accessor :default_language
|
96
|
-
#
|
95
|
+
# Symbol - default encoding to use.
|
97
96
|
attr_accessor :default_encoding
|
98
97
|
# Boolean - detect language or use default?
|
99
98
|
attr_accessor :detect_language
|
100
|
-
#
|
101
|
-
# (:entity, :sentence, :zone, :
|
99
|
+
# Symbol - the ideal entity level to detect language at
|
100
|
+
# (e.g., :entity, :sentence, :zone, :section, :document)
|
102
101
|
attr_accessor :language_detection_level
|
103
102
|
# String - main folder for executable files.
|
104
103
|
attr_accessor :bin
|
@@ -117,13 +116,13 @@ module Treat
|
|
117
116
|
# Turn language detection off by default.
|
118
117
|
self.detect_language = false
|
119
118
|
# Detect the language once per section by default.
|
120
|
-
self.language_detection_level = :
|
119
|
+
self.language_detection_level = :section
|
121
120
|
# Set the lib path to that of this file.
|
122
121
|
self.lib = File.dirname(__FILE__)
|
123
122
|
# Set the paths to the bin, test and tmp folders.
|
124
|
-
self.bin = self.lib + '/../bin
|
125
|
-
self.test = self.lib + '/../test
|
126
|
-
self.tmp = self.lib + '/../tmp
|
123
|
+
self.bin = self.lib + '/../bin'
|
124
|
+
self.test = self.lib + '/../test'
|
125
|
+
self.tmp = self.lib + '/../tmp'
|
127
126
|
|
128
127
|
# Require modified core classes.
|
129
128
|
require 'treat/object'
|
@@ -137,6 +136,7 @@ module Treat
|
|
137
136
|
require 'treat/proxies'
|
138
137
|
require 'treat/sugar'
|
139
138
|
|
139
|
+
# Make sugar available when needed.
|
140
140
|
extend Sugar
|
141
141
|
|
142
142
|
end
|
data/test/tc_entity.rb
CHANGED
@@ -2,7 +2,7 @@ module Treat
|
|
2
2
|
module Tests
|
3
3
|
class TestEntity < Test::Unit::TestCase
|
4
4
|
def setup
|
5
|
-
@text = Treat::Entities::
|
5
|
+
@text = Treat::Entities::Section.new
|
6
6
|
|
7
7
|
@sentence = Treat::Entities::Sentence.new
|
8
8
|
|
@@ -14,23 +14,23 @@ module Treat
|
|
14
14
|
@adj_phrase.set :tag, 'ADJP'
|
15
15
|
|
16
16
|
@det = Treat::Entities::Word.new('The')
|
17
|
-
@det.set :
|
17
|
+
@det.set :category, :determiner
|
18
18
|
@det.set :tag, 'DT'
|
19
19
|
@det.set :tag_set, :penn
|
20
20
|
@adj = Treat::Entities::Word.new('lazy')
|
21
|
-
@adj.set :
|
21
|
+
@adj.set :category, :adjective
|
22
22
|
@adj.set :tag, 'JJ'
|
23
23
|
@adj.set :tag_set, :penn
|
24
24
|
@noun = Treat::Entities::Word.new('fox')
|
25
|
-
@noun.set :
|
25
|
+
@noun.set :category, :noun
|
26
26
|
@noun.set :tag, 'NN'
|
27
27
|
@noun.set :tag_set, :penn
|
28
28
|
@aux = Treat::Entities::Word.new('is')
|
29
|
-
@aux.set :
|
29
|
+
@aux.set :category, :verb
|
30
30
|
@aux.set :tag, 'VBZ'
|
31
31
|
@aux.set :tag_set, :penn
|
32
32
|
@verb = Treat::Entities::Word.new('running')
|
33
|
-
@verb.set :
|
33
|
+
@verb.set :category, :verb
|
34
34
|
@verb.set :tag, 'VBG'
|
35
35
|
@verb.set :tag_set, :penn
|
36
36
|
@dot = Treat::Entities::Punctuation.new('.')
|
@@ -62,7 +62,7 @@ module Treat
|
|
62
62
|
end
|
63
63
|
|
64
64
|
def test_type
|
65
|
-
assert_equal :
|
65
|
+
assert_equal :section, @text.type
|
66
66
|
end
|
67
67
|
|
68
68
|
def test_printers
|
data/test/tc_extractors.rb
CHANGED
@@ -25,9 +25,11 @@ module Treat
|
|
25
25
|
# assert_nothing_raised { @doc.named_entity(:abner) }
|
26
26
|
end
|
27
27
|
|
28
|
-
def
|
29
|
-
|
30
|
-
|
28
|
+
def test_keywords
|
29
|
+
assert_nothing_raised do
|
30
|
+
topics = @doc.topic_words(:lda)
|
31
|
+
@doc.keywords(:topics_frequency, topic_words: topics)
|
32
|
+
end
|
31
33
|
end
|
32
34
|
|
33
35
|
def test_topics
|
@@ -38,7 +40,7 @@ module Treat
|
|
38
40
|
@doc.chunk.segment(:tactful).tokenize
|
39
41
|
|
40
42
|
assert_nothing_raised { @doc.statistics(:frequency_of, value: 'the') }
|
41
|
-
assert_nothing_raised { @word.statistics(:
|
43
|
+
assert_nothing_raised { @word.statistics(:frequency_in) }
|
42
44
|
# assert_nothing_raised { @doc.statistics(:position_in) }
|
43
45
|
# assert_nothing_raised { @doc.statistics(:transition_matrix) }
|
44
46
|
# assert_nothing_raised { @doc.statistics(:transition_probability) }
|