treat 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +7 -8
- data/TODO +16 -13
- data/examples/keywords.rb +89 -1
- data/lib/treat/buildable.rb +1 -8
- data/lib/treat/categories.rb +3 -4
- data/lib/treat/category.rb +1 -1
- data/lib/treat/delegatable.rb +1 -1
- data/lib/treat/detectors/encoding/native.rb +5 -0
- data/lib/treat/detectors/encoding/r_chardet19.rb +2 -3
- data/lib/treat/detectors/language/language_detector.rb +4 -0
- data/lib/treat/detectors/language/what_language.rb +4 -4
- data/lib/treat/detectors.rb +1 -1
- data/lib/treat/entities/entity.rb +5 -3
- data/lib/treat/entities/tokens.rb +14 -5
- data/lib/treat/entities/zones.rb +4 -0
- data/lib/treat/entities.rb +7 -5
- data/lib/treat/extractors/keywords/topics_frequency.rb +40 -0
- data/lib/treat/extractors/statistics/{frequency.rb → frequency_in.rb} +5 -4
- data/lib/treat/extractors/statistics/frequency_of.rb +3 -5
- data/lib/treat/extractors/statistics/{position_in.rb → position_in_parent.rb} +4 -3
- data/lib/treat/extractors/statistics/tf_idf.rb +36 -0
- data/lib/treat/extractors/statistics/transition_matrix.rb +25 -25
- data/lib/treat/extractors/statistics/transition_probability.rb +8 -4
- data/lib/treat/extractors/time/chronic.rb +8 -0
- data/lib/treat/extractors/time/native.rb +6 -0
- data/lib/treat/extractors/time/nickel.rb +31 -23
- data/lib/treat/extractors/topic_words/lda.rb +21 -16
- data/lib/treat/extractors/topics/reuters.rb +6 -4
- data/lib/treat/extractors.rb +7 -7
- data/lib/treat/formatters/readers/abw.rb +32 -0
- data/lib/treat/formatters/readers/autoselect.rb +13 -11
- data/lib/treat/formatters/readers/doc.rb +13 -0
- data/lib/treat/formatters/readers/gocr.rb +2 -0
- data/lib/treat/formatters/readers/html.rb +21 -1
- data/lib/treat/formatters/readers/ocropus.rb +3 -3
- data/lib/treat/formatters/readers/odt.rb +41 -0
- data/lib/treat/formatters/readers/pdf.rb +5 -2
- data/lib/treat/formatters/readers/txt.rb +2 -0
- data/lib/treat/formatters/serializers/xml.rb +3 -2
- data/lib/treat/formatters/serializers/yaml.rb +2 -0
- data/lib/treat/formatters/unserializers/autoselect.rb +7 -1
- data/lib/treat/formatters/unserializers/xml.rb +6 -1
- data/lib/treat/formatters/unserializers/yaml.rb +5 -1
- data/lib/treat/formatters/visualizers/dot.rb +35 -37
- data/lib/treat/formatters/visualizers/html.rb +1 -0
- data/lib/treat/formatters/visualizers/inspect.rb +4 -0
- data/lib/treat/formatters/visualizers/short_value.rb +18 -3
- data/lib/treat/formatters/visualizers/standoff.rb +11 -6
- data/lib/treat/formatters/visualizers/tree.rb +5 -1
- data/lib/treat/formatters/visualizers/txt.rb +6 -1
- data/lib/treat/formatters.rb +1 -1
- data/lib/treat/group.rb +4 -3
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +14 -17
- data/lib/treat/inflectors/conjugations/linguistics.rb +16 -3
- data/lib/treat/inflectors/declensions/linguistics.rb +17 -6
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +9 -10
- data/lib/treat/inflectors/stem/porter.rb +6 -2
- data/lib/treat/inflectors/stem/porter_c.rb +4 -1
- data/lib/treat/inflectors/stem/uea.rb +4 -4
- data/lib/treat/languages/english/tags.rb +16 -0
- data/lib/treat/languages/english.rb +4 -1
- data/lib/treat/lexicalizers/category/from_tag.rb +4 -4
- data/lib/treat/lexicalizers/linkages/naive.rb +3 -3
- data/lib/treat/lexicalizers/tag/brill.rb +3 -11
- data/lib/treat/lexicalizers/tag/lingua.rb +4 -6
- data/lib/treat/lexicalizers.rb +0 -2
- data/lib/treat/processors/chunkers/txt.rb +4 -4
- data/lib/treat/processors/parsers/enju.rb +3 -17
- data/lib/treat/processors/parsers/stanford.rb +4 -0
- data/lib/treat/processors/segmenters/punkt.rb +1 -0
- data/lib/treat/processors/segmenters/stanford.rb +4 -0
- data/lib/treat/processors/segmenters/tactful.rb +4 -1
- data/lib/treat/processors/tokenizers/punkt.rb +1 -2
- data/lib/treat/processors/tokenizers/stanford.rb +4 -0
- data/lib/treat/processors/tokenizers/tactful.rb +1 -1
- data/lib/treat/processors.rb +4 -4
- data/lib/treat/proxies.rb +18 -11
- data/lib/treat/registrable.rb +12 -5
- data/lib/treat/sugar.rb +8 -3
- data/lib/treat/tree.rb +10 -3
- data/lib/treat.rb +55 -55
- data/test/tc_entity.rb +7 -7
- data/test/tc_extractors.rb +6 -4
- data/test/tc_formatters.rb +0 -4
- data/test/tests.rb +2 -0
- data/test/texts.rb +4 -4
- metadata +48 -56
- data/examples/texts/bugged_out.txt +0 -26
- data/examples/texts/half_cocked_basel.txt +0 -16
- data/examples/texts/hedge_funds.txt +0 -24
- data/examples/texts/hose_and_dry.txt +0 -19
- data/examples/texts/hungarys_troubles.txt +0 -46
- data/examples/texts/indias_slowdown.txt +0 -15
- data/examples/texts/merkozy_rides_again.txt +0 -24
- data/examples/texts/prada_is_not_walmart.txt +0 -9
- data/examples/texts/republican_nomination.txt +0 -26
- data/examples/texts/to_infinity_and_beyond.txt +0 -15
- data/lib/treat/entities/text.rb +0 -7
- data/lib/treat/extractors/key_sentences/topics_frequency.rb +0 -49
- data/lib/treat/formatters/cleaners/html.rb +0 -17
data/lib/treat/lexicalizers.rb
CHANGED
@@ -29,7 +29,6 @@ module Treat
|
|
29
29
|
extend Group
|
30
30
|
self.type = :annotator
|
31
31
|
self.targets = [:word, :number]
|
32
|
-
|
33
32
|
def self.synonyms(entity, synsets)
|
34
33
|
synsets.collect { |ss| ss.synonyms }.flatten - [entity.value]
|
35
34
|
end
|
@@ -42,7 +41,6 @@ module Treat
|
|
42
41
|
def self.hypernyms(entity, synsets)
|
43
42
|
synsets.collect { |ss| ss.hypernyms }.flatten
|
44
43
|
end
|
45
|
-
|
46
44
|
end
|
47
45
|
extend Treat::Category
|
48
46
|
end
|
@@ -5,18 +5,18 @@ module Treat
|
|
5
5
|
# zones based on a very naive analysis of the
|
6
6
|
# file.
|
7
7
|
class Txt
|
8
|
-
#
|
8
|
+
# Split a document into Zone objects.
|
9
9
|
def self.chunk(text, options = {})
|
10
10
|
zones = text.to_s.split("\n")
|
11
11
|
zones.each do |zone|
|
12
12
|
next if zone.strip == ''
|
13
13
|
if false # fix
|
14
|
-
text << Entities::List.new(zone)
|
14
|
+
text << Treat::Entities::List.new(zone)
|
15
15
|
end
|
16
16
|
if zone.length < 60
|
17
|
-
text << Entities::Title.new(zone)
|
17
|
+
text << Treat::Entities::Title.new(zone)
|
18
18
|
else
|
19
|
-
text << Entities::Paragraph.new(zone)
|
19
|
+
text << Treat::Entities::Paragraph.new(zone)
|
20
20
|
end
|
21
21
|
end
|
22
22
|
text
|
@@ -20,21 +20,6 @@ module Treat
|
|
20
20
|
@@i = 0
|
21
21
|
# Require the Nokogiri XML parser.
|
22
22
|
require 'nokogiri'
|
23
|
-
# Maps Enju categories to Treat categories.
|
24
|
-
CategoryMap = {
|
25
|
-
'ADJ' => :adjective,
|
26
|
-
'ADV' => :adverb,
|
27
|
-
'CONJ' => :conjunction,
|
28
|
-
'COOD' => :conjunction,
|
29
|
-
'C' => :complementizer,
|
30
|
-
'D' => :determiner,
|
31
|
-
'N' => :noun,
|
32
|
-
'P' => :preposition,
|
33
|
-
'PN' => :punctuation,
|
34
|
-
'SC' => :conjunction,
|
35
|
-
'V' => :verb,
|
36
|
-
'PRT' => :particle
|
37
|
-
}
|
38
23
|
# Return the process running Enju.
|
39
24
|
def self.proc
|
40
25
|
if @@parsers.size < @@options[:processes]
|
@@ -55,7 +40,8 @@ module Treat
|
|
55
40
|
text = entity.to_s + '.'
|
56
41
|
else
|
57
42
|
remove_last = false
|
58
|
-
text = entity.to_s.gsub('.', '')
|
43
|
+
text = entity.to_s.gsub('.', '')
|
44
|
+
text += '.' unless ['!', '?'].include?(text[-1])
|
59
45
|
end
|
60
46
|
stdin.puts(text + "\n")
|
61
47
|
parsed = build(stdout.gets, remove_last)
|
@@ -114,7 +100,7 @@ module Treat
|
|
114
100
|
new_attributes[:saturated] = (value[-1] == 'P')
|
115
101
|
value = value[0..-2]
|
116
102
|
end
|
117
|
-
cat =
|
103
|
+
cat = Treat::Languages::English::EnjuCatToCategory[value]
|
118
104
|
new_attributes[:cat] = cat
|
119
105
|
else
|
120
106
|
new_attributes[:enju_cat] = value
|
@@ -1,6 +1,7 @@
|
|
1
1
|
module Treat
|
2
2
|
module Processors
|
3
3
|
module Parsers
|
4
|
+
# A wrapper class for the Stanford parser.
|
4
5
|
class Stanford
|
5
6
|
# Require the Ruby-Java bridge.
|
6
7
|
silence_warnings { require 'rjb' }
|
@@ -13,6 +14,7 @@ module Treat
|
|
13
14
|
Rjb::load(jars[0], ['-Xms256M', '-Xmx512M'])
|
14
15
|
LexicalizedParser = ::Rjb::import('edu.stanford.nlp.parser.lexparser.LexicalizedParser')
|
15
16
|
@@parsers = {}
|
17
|
+
# Parse the entity using the Stanford parser.
|
16
18
|
def self.parse(entity, options = {})
|
17
19
|
lang = Treat::Languages.describe(entity.language).to_s.upcase
|
18
20
|
pcfg = "#{Treat.bin}/stanford-parser*/grammar/#{lang.upcase}PCFG.ser.gz"
|
@@ -26,6 +28,8 @@ module Treat
|
|
26
28
|
recurse(parse, entity)
|
27
29
|
entity
|
28
30
|
end
|
31
|
+
# Helper method which recurses the tree supplied by
|
32
|
+
# the Stanford parser.
|
29
33
|
def self.recurse(java_node, ruby_node)
|
30
34
|
# Leaf
|
31
35
|
if java_node.num_children == 0
|
@@ -1,6 +1,8 @@
|
|
1
1
|
module Treat
|
2
2
|
module Processors
|
3
3
|
module Segmenters
|
4
|
+
# A wrapper for the sentence splitter supplied by
|
5
|
+
# the Stanford parser.
|
4
6
|
class Stanford
|
5
7
|
# Require the Ruby-Java bridge.
|
6
8
|
silence_warnings do
|
@@ -16,6 +18,8 @@ module Treat
|
|
16
18
|
::Rjb::import('edu.stanford.nlp.process.DocumentPreprocessor')
|
17
19
|
StringReader = ::Rjb::import('java.io.StringReader')
|
18
20
|
end
|
21
|
+
# Segment sentences using the sentence splitter supplied by
|
22
|
+
# the Stanford parser.
|
19
23
|
def self.segment(entity, options = {})
|
20
24
|
sr = StringReader.new(entity.to_s)
|
21
25
|
sit = DocumentPreprocessor.new(sr).iterator
|
@@ -8,7 +8,10 @@ module Treat
|
|
8
8
|
# based on Splitta, but has support for ‘?’ and ‘!’
|
9
9
|
# as well as primitive handling of XHTML markup.
|
10
10
|
#
|
11
|
-
# Project website:
|
11
|
+
# Project website: https://github.com/SlyShy/Tackful-Tokenizer
|
12
|
+
# Original paper: Dan Gillick. 2009. Sentence Boundary Detection
|
13
|
+
# and the Problem with the U.S. University of California, Berkeley.
|
14
|
+
# http://dgillick.com/resource/sbd_naacl_2009.pdf
|
12
15
|
class Tactful
|
13
16
|
# Require the 'tactful_tokenizer' gem.
|
14
17
|
silence_warnings { require 'tactful_tokenizer' }
|
@@ -26,12 +26,11 @@ module Treat
|
|
26
26
|
ReWordTokenizer = /#{ReMultiCharPunct}|(?=#{ReWordStart})\S+?(?=\s|$|#{ReNonWordChars}|#{ReMultiCharPunct}|,(?=$|\s|#{ReNonWordChars}|#{ReMultiCharPunct}))|\S/
|
27
27
|
RePeriodContext = /\S*#{ReSentEndChars}(?=(?<after_tok>#{ReNonWordChars}|\s+(?<next_tok>\S+)))/
|
28
28
|
# Tokenize the text using the algorithm lifted from
|
29
|
-
# the Punkt tokenizer.
|
29
|
+
# the Punkt tokenizer gem.
|
30
30
|
#
|
31
31
|
# Options: none.
|
32
32
|
def self.tokenize(entity, options = {})
|
33
33
|
entity.to_s.scan(ReWordTokenizer).each do |token|
|
34
|
-
puts token
|
35
34
|
entity << Treat::Entities::Entity.from_string(token)
|
36
35
|
end
|
37
36
|
entity
|
@@ -1,6 +1,8 @@
|
|
1
1
|
module Treat
|
2
2
|
module Processors
|
3
3
|
module Tokenizers
|
4
|
+
# A wrapper for the Stanford parser's Penn-Treebank
|
5
|
+
# style tokenizer.
|
4
6
|
class Stanford
|
5
7
|
# Require the Ruby-Java bridge.
|
6
8
|
silence_warnings do
|
@@ -18,6 +20,8 @@ module Treat
|
|
18
20
|
CoreLabelTokenFactory = ::Rjb::import('edu.stanford.nlp.process.CoreLabelTokenFactory')
|
19
21
|
StringReader = ::Rjb::import('java.io.StringReader')
|
20
22
|
end
|
23
|
+
# Tokenize the entity using a Penn-Treebank style tokenizer
|
24
|
+
# included with the Stanford Parser.
|
21
25
|
def self.tokenize(entity, options = {})
|
22
26
|
ptbt = PTBTokenizer.new(
|
23
27
|
StringReader.new(entity.to_s),
|
@@ -41,7 +41,7 @@ module Treat
|
|
41
41
|
[/([Cc])annot/, '\1an not']
|
42
42
|
]
|
43
43
|
# Tokenize the entity using a rule-based algorithm
|
44
|
-
#
|
44
|
+
# that has been lifted from the 'tactful-tokenizer'
|
45
45
|
# gem.
|
46
46
|
def self.tokenize(entity, options = {})
|
47
47
|
s = entity.to_s
|
data/lib/treat/processors.rb
CHANGED
@@ -17,19 +17,19 @@ module Treat
|
|
17
17
|
module Chunkers
|
18
18
|
extend Group
|
19
19
|
self.type = :transformer
|
20
|
-
self.targets = [:document, :
|
20
|
+
self.targets = [:document, :zone]
|
21
21
|
end
|
22
22
|
# Segmenters split a text or zone into sentences.
|
23
23
|
module Segmenters
|
24
24
|
extend Group
|
25
25
|
self.type = :transformer
|
26
|
-
self.targets = [:document, :
|
26
|
+
self.targets = [:document, :zone]
|
27
27
|
end
|
28
28
|
# Tokenizers split a sentence into Token objects.
|
29
29
|
module Tokenizers
|
30
30
|
extend Group
|
31
31
|
self.type = :transformer
|
32
|
-
self.targets = [:document, :
|
32
|
+
self.targets = [:document, :zone, :sentence, :constituent]
|
33
33
|
end
|
34
34
|
# Parsers split a sentence into constituent objects
|
35
35
|
# representing its syntactic structure, with the
|
@@ -37,7 +37,7 @@ module Treat
|
|
37
37
|
module Parsers
|
38
38
|
extend Group
|
39
39
|
self.type = :transformer
|
40
|
-
self.targets = [:document, :
|
40
|
+
self.targets = [:document, :zone, :sentence, :constituent]
|
41
41
|
end
|
42
42
|
# Makes all the groups autoloadable and creates the delegators.
|
43
43
|
extend Treat::Category
|
data/lib/treat/proxies.rb
CHANGED
@@ -1,11 +1,13 @@
|
|
1
1
|
module Treat
|
2
|
-
# Proxies install Treat functions on
|
2
|
+
# Proxies install Treat functions on core Ruby classes.
|
3
3
|
module Proxies
|
4
4
|
# The module proxy provides functionality common
|
5
5
|
# to the different types of proxies.
|
6
6
|
module Proxy
|
7
|
+
# Build the entity corresponding to the proxied
|
8
|
+
# object and send the method call to the entity.
|
7
9
|
def method_missing(sym, *args, &block)
|
8
|
-
if Categories.have_method?(sym)
|
10
|
+
if Treat::Categories.have_method?(sym)
|
9
11
|
to_entity.send(sym, *args)
|
10
12
|
else
|
11
13
|
super(sym, *args, &block)
|
@@ -16,8 +18,8 @@ module Treat
|
|
16
18
|
end
|
17
19
|
end
|
18
20
|
# Install Treat functions on String objects.
|
19
|
-
module
|
20
|
-
include Proxy
|
21
|
+
module String
|
22
|
+
include Treat::Proxies::Proxy
|
21
23
|
# Save the string to the specified file.
|
22
24
|
def save(file)
|
23
25
|
File.open(file, 'w') { |f| f.write(self) }
|
@@ -28,16 +30,21 @@ module Treat
|
|
28
30
|
end
|
29
31
|
end
|
30
32
|
# Install Treat functions on Numeric objects.
|
31
|
-
module
|
32
|
-
include Proxy
|
33
|
+
module Numeric
|
34
|
+
include Treat::Proxies::Proxy
|
33
35
|
# Return the entity corresponding to the number.
|
34
36
|
def to_entity(builder = nil)
|
35
37
|
Treat::Entities::Entity.from_numeric(self)
|
36
38
|
end
|
37
39
|
end
|
38
40
|
# Install Treat functions on Array objects.
|
39
|
-
module
|
40
|
-
include Proxy
|
41
|
+
module Array
|
42
|
+
include Treat::Proxies::Proxy
|
43
|
+
# The behaviour of this proxy is special:
|
44
|
+
# if a Treat function is called on an array,
|
45
|
+
# the function will be called on each element
|
46
|
+
# of the array and a new array with the
|
47
|
+
# results will be returned.
|
41
48
|
def method_missing(sym, *args, &block)
|
42
49
|
if Category.has_method?(sym)
|
43
50
|
array = []
|
@@ -59,8 +66,8 @@ module Treat
|
|
59
66
|
end
|
60
67
|
end
|
61
68
|
# Include the proxies in the core classes.
|
62
|
-
String.class_eval { include
|
63
|
-
Numeric.class_eval { include
|
64
|
-
Array.class_eval { include
|
69
|
+
::String.class_eval { include Treat::Proxies::String }
|
70
|
+
::Numeric.class_eval { include Treat::Proxies::Numeric }
|
71
|
+
::Array.class_eval { include Treat::Proxies::Array }
|
65
72
|
end
|
66
73
|
end
|
data/lib/treat/registrable.rb
CHANGED
@@ -3,20 +3,27 @@ module Treat
|
|
3
3
|
# Registers a token in the @token_registry
|
4
4
|
# hash in the root node.
|
5
5
|
def register_token(token)
|
6
|
-
if is_root?
|
6
|
+
if is_root? || type == :document
|
7
7
|
@token_registry ||= {value: {}, id: {}}
|
8
8
|
@token_registry[:id][token.id] = token
|
9
|
-
@token_registry[:value][token.
|
10
|
-
@token_registry[:value][token.
|
9
|
+
@token_registry[:value][token.to_s] ||= []
|
10
|
+
@token_registry[:value][token.to_s] << token
|
11
|
+
if has_parent? && type == :document
|
12
|
+
@parent.register_token(token)
|
13
|
+
end
|
11
14
|
else
|
12
15
|
@parent.register_token(token)
|
13
16
|
end
|
14
17
|
end
|
15
18
|
# Find the token registry, which is
|
16
19
|
# always in the root node.
|
17
|
-
def token_registry
|
20
|
+
def token_registry(type = nil)
|
21
|
+
if self.type == type
|
22
|
+
@token_registry ||= {value: {}, id: {}}
|
23
|
+
return @token_registry
|
24
|
+
end
|
18
25
|
if has_parent?
|
19
|
-
@parent.token_registry
|
26
|
+
@parent.token_registry(type)
|
20
27
|
else
|
21
28
|
@token_registry ||= {value: {}, id: {}}
|
22
29
|
@token_registry
|
data/lib/treat/sugar.rb
CHANGED
@@ -1,5 +1,10 @@
|
|
1
1
|
module Treat
|
2
|
+
# This module provides syntactic sugar in the following manner:
|
3
|
+
# all entities found under Treat::Entities will be made
|
4
|
+
# available within the global namespace. For example,
|
5
|
+
# Treat::Entities::Word can now be referred to as simply 'Word'.
|
2
6
|
module Sugar
|
7
|
+
# Installs syntactic sugar.
|
3
8
|
def edulcorate
|
4
9
|
return if @@edulcorated
|
5
10
|
@@edulcorated = true
|
@@ -13,6 +18,7 @@ module Treat
|
|
13
18
|
end
|
14
19
|
end
|
15
20
|
end
|
21
|
+
# Uninstalls syntactic sugar.
|
16
22
|
def unedulcorate
|
17
23
|
return unless @@edulcorated
|
18
24
|
@@edulcorated = false
|
@@ -24,14 +30,13 @@ module Treat
|
|
24
30
|
end
|
25
31
|
end
|
26
32
|
end
|
27
|
-
#
|
33
|
+
# Boolean - whether syntactic sugar is
|
28
34
|
# enabled or not.
|
29
35
|
def edulcorated?; @@edulcorated; end
|
30
36
|
# Syntactic sugar is disabled by default.
|
31
37
|
@@edulcorated = false
|
32
|
-
|
33
38
|
private
|
34
|
-
|
39
|
+
# Helper method, yields each entity type and class.
|
35
40
|
def each_entity_class
|
36
41
|
Treat::Entities.list.each do |entity_type|
|
37
42
|
type = :"#{cc(entity_type)}"
|
data/lib/treat/tree.rb
CHANGED
@@ -68,18 +68,20 @@ module Treat
|
|
68
68
|
end
|
69
69
|
nodes[0]
|
70
70
|
end
|
71
|
+
# Retrieve a child node by name or index.
|
71
72
|
def [](name_or_index)
|
72
73
|
if name_or_index == nil
|
73
74
|
raise Treat::Exception,
|
74
75
|
"Non-nil name or index needs to be provided."
|
75
76
|
end
|
76
77
|
if name_or_index.kind_of?(Integer) &&
|
77
|
-
name_or_index < 1000
|
78
|
+
name_or_index < 1000
|
78
79
|
@children[name_or_index]
|
79
80
|
else
|
80
81
|
@children_hash[name_or_index]
|
81
82
|
end
|
82
83
|
end
|
84
|
+
# Remove the supplied node or id of a node from the children.
|
83
85
|
def remove!(ion)
|
84
86
|
return nil unless ion
|
85
87
|
if ion.is_a? Treat::Tree::Node
|
@@ -91,6 +93,7 @@ module Treat
|
|
91
93
|
@children_hash.delete(ion)
|
92
94
|
end
|
93
95
|
end
|
96
|
+
# Remove all children.
|
94
97
|
def remove_all!
|
95
98
|
@children.each { |child| child.set_as_root! }
|
96
99
|
@children.clear
|
@@ -103,14 +106,18 @@ module Treat
|
|
103
106
|
id = @parent.children.index(self)
|
104
107
|
@parent.children.at(id + 1) if id
|
105
108
|
end
|
109
|
+
# Return the sibling N positions to the left of this one.
|
106
110
|
def left(n = 1); sibling(-1*n); end
|
111
|
+
# Return the sibling N positions to the right of this one.
|
107
112
|
def right(n = 1); sibling(1*n); end
|
113
|
+
# Return the sibling with position #pos versus
|
114
|
+
# this one. #pos can be ... -1, 0, 1, ...
|
108
115
|
def sibling(pos)
|
109
116
|
return nil if is_root?
|
110
117
|
id = @parent.children.index(self)
|
111
118
|
@parent.children.at(id + pos)
|
112
119
|
end
|
113
|
-
#
|
120
|
+
# Return all brothers and sisters of this node.
|
114
121
|
def siblings
|
115
122
|
r = @parent.children.dup
|
116
123
|
r.delete(self)
|
@@ -133,7 +140,7 @@ module Treat
|
|
133
140
|
# Does the entity have a feature ?
|
134
141
|
def has_feature?(feature)
|
135
142
|
@features.has_key?(feature) ||
|
136
|
-
|
143
|
+
[:id, :value, :children, :edges].include?(feature)
|
137
144
|
end
|
138
145
|
alias :has? :has_feature?
|
139
146
|
# Link this node to the target node with
|
data/lib/treat.rb
CHANGED
@@ -1,51 +1,50 @@
|
|
1
|
-
#
|
2
1
|
# Main namespace for Treat modules.
|
3
2
|
#
|
4
|
-
#
|
3
|
+
# === Entities
|
5
4
|
#
|
6
|
-
#
|
7
|
-
#
|
8
|
-
#
|
9
|
-
#
|
10
|
-
#
|
5
|
+
# Entities are Tree structures that represent any textual
|
6
|
+
# entity (from a collection of texts down to an individual
|
7
|
+
# word) with a value, features, children and edges linking
|
8
|
+
# it to other textual entities. Sugar provides syntactic sugar
|
9
|
+
# for Entities and can be enabled by running Treat.edulcorate.
|
11
10
|
#
|
12
|
-
#
|
11
|
+
# Here are some examples of how to create entities:
|
13
12
|
#
|
14
|
-
#
|
15
|
-
#
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
13
|
+
# c = Collection 'folder_with_documents'
|
14
|
+
# d = Document 'filename.txt' # (or PDF, html, xml, png, jpg, gif).
|
15
|
+
# p = Paragraph 'A short story. The end.'
|
16
|
+
# s = Sentence 'That is not a sentence.'
|
17
|
+
# w = Word 'fox'
|
19
18
|
#
|
20
|
-
#
|
21
|
-
#
|
22
|
-
#
|
23
|
-
#
|
19
|
+
# Here's a full list of entities (subtypes in parentheses):
|
20
|
+
# Collection, Document, Zone (Section, Title, Paragraph or List),
|
21
|
+
# Sentence, Constituent (Phrase or Clause), Token (Word, Number,
|
22
|
+
# Symbol or Punctuation).
|
23
|
+
#
|
24
|
+
# === Proxies
|
24
25
|
#
|
25
|
-
#
|
26
|
+
# Proxies allow the Treat functions to be called on the core
|
27
|
+
# Ruby classes String, Numeric and Array. They build the entity
|
28
|
+
# corresponding to the supplied raw text and send the requested
|
29
|
+
# function to it.
|
26
30
|
#
|
27
|
-
#
|
28
|
-
# Ruby classes String, Numeric and Array. They build the entity
|
29
|
-
# corresponding to the supplied raw text and send the requested
|
30
|
-
# function to it.
|
31
|
-
#
|
32
|
-
# For example,
|
31
|
+
# For example,
|
33
32
|
#
|
34
|
-
#
|
33
|
+
# 'fox'.tag
|
35
34
|
#
|
36
|
-
#
|
35
|
+
# Is equivalent to:
|
37
36
|
#
|
38
|
-
#
|
39
|
-
#
|
37
|
+
# w = Word 'fox'
|
38
|
+
# w.tag
|
40
39
|
#
|
41
|
-
#
|
40
|
+
# === Functions
|
42
41
|
#
|
43
|
-
#
|
44
|
-
#
|
45
|
-
#
|
46
|
-
#
|
42
|
+
# A class is defined for each implemented algorithm performing a given
|
43
|
+
# task. These classes are clustered into groups of algorithms performing
|
44
|
+
# the same given task (Group), and the groups are clustered into Categories
|
45
|
+
# of groups performing related tasks.
|
47
46
|
#
|
48
|
-
#
|
47
|
+
# Here are the different Categories:
|
49
48
|
#
|
50
49
|
# - Detectors - Category for language, encoding, and format
|
51
50
|
# detectors.
|
@@ -60,22 +59,22 @@
|
|
60
59
|
# - Processors - Namespace for algorithms that process collections and
|
61
60
|
# documents into trees.
|
62
61
|
#
|
63
|
-
#
|
62
|
+
# === Linguistic resources
|
64
63
|
#
|
65
|
-
#
|
66
|
-
#
|
67
|
-
#
|
64
|
+
# The Languages module contains linguistic information about
|
65
|
+
# languages (full ISO-639-1 and 2 language list, tag alignments
|
66
|
+
# for three treebanks, word categories, etc.)
|
68
67
|
#
|
69
|
-
#
|
68
|
+
# === Mixins for entities.
|
70
69
|
#
|
71
|
-
#
|
72
|
-
#
|
73
|
-
#
|
74
|
-
#
|
70
|
+
# Buildable, Delegatable, Visitable and Registrable are included
|
71
|
+
# or extended by Entity and provide it with the ability to be built,
|
72
|
+
# to delegate function calls, to accept visitors and to maintain a
|
73
|
+
# token registry, respectively.
|
75
74
|
#
|
76
|
-
#
|
75
|
+
# === Exception class.
|
77
76
|
#
|
78
|
-
#
|
77
|
+
# Exception defines a custom exception class for the Treat module.
|
79
78
|
#
|
80
79
|
module Treat
|
81
80
|
|
@@ -85,20 +84,20 @@ module Treat
|
|
85
84
|
end
|
86
85
|
|
87
86
|
# The current version of Treat.
|
88
|
-
VERSION = "0.1.
|
87
|
+
VERSION = "0.1.3"
|
89
88
|
|
90
|
-
#
|
89
|
+
# $LOAD_PATH << '/ruby/treat/lib/' # Remove for release
|
91
90
|
|
92
91
|
# Create class variables for the Treat module.
|
93
92
|
class << self
|
94
|
-
#
|
93
|
+
# Symbol - default language to use when detect_language is false.
|
95
94
|
attr_accessor :default_language
|
96
|
-
#
|
95
|
+
# Symbol - default encoding to use.
|
97
96
|
attr_accessor :default_encoding
|
98
97
|
# Boolean - detect language or use default?
|
99
98
|
attr_accessor :detect_language
|
100
|
-
#
|
101
|
-
# (:entity, :sentence, :zone, :
|
99
|
+
# Symbol - the ideal entity level to detect language at
|
100
|
+
# (e.g., :entity, :sentence, :zone, :section, :document)
|
102
101
|
attr_accessor :language_detection_level
|
103
102
|
# String - main folder for executable files.
|
104
103
|
attr_accessor :bin
|
@@ -117,13 +116,13 @@ module Treat
|
|
117
116
|
# Turn language detection off by default.
|
118
117
|
self.detect_language = false
|
119
118
|
# Detect the language once per text by default.
|
120
|
-
self.language_detection_level = :
|
119
|
+
self.language_detection_level = :section
|
121
120
|
# Set the lib path to that of this file.
|
122
121
|
self.lib = File.dirname(__FILE__)
|
123
122
|
# Set the paths to the bin, test and tmp folders.
|
124
|
-
self.bin = self.lib + '/../bin
|
125
|
-
self.test = self.lib + '/../test
|
126
|
-
self.tmp = self.lib + '/../tmp
|
123
|
+
self.bin = self.lib + '/../bin'
|
124
|
+
self.test = self.lib + '/../test'
|
125
|
+
self.tmp = self.lib + '/../tmp'
|
127
126
|
|
128
127
|
# Require modified core classes.
|
129
128
|
require 'treat/object'
|
@@ -137,6 +136,7 @@ module Treat
|
|
137
136
|
require 'treat/proxies'
|
138
137
|
require 'treat/sugar'
|
139
138
|
|
139
|
+
# Make sugar available when needed.
|
140
140
|
extend Sugar
|
141
141
|
|
142
142
|
end
|
data/test/tc_entity.rb
CHANGED
@@ -2,7 +2,7 @@ module Treat
|
|
2
2
|
module Tests
|
3
3
|
class TestEntity < Test::Unit::TestCase
|
4
4
|
def setup
|
5
|
-
@text = Treat::Entities::
|
5
|
+
@text = Treat::Entities::Section.new
|
6
6
|
|
7
7
|
@sentence = Treat::Entities::Sentence.new
|
8
8
|
|
@@ -14,23 +14,23 @@ module Treat
|
|
14
14
|
@adj_phrase.set :tag, 'ADJP'
|
15
15
|
|
16
16
|
@det = Treat::Entities::Word.new('The')
|
17
|
-
@det.set :
|
17
|
+
@det.set :category, :determiner
|
18
18
|
@det.set :tag, 'DT'
|
19
19
|
@det.set :tag_set, :penn
|
20
20
|
@adj = Treat::Entities::Word.new('lazy')
|
21
|
-
@adj.set :
|
21
|
+
@adj.set :category, :adjective
|
22
22
|
@adj.set :tag, 'JJ'
|
23
23
|
@adj.set :tag_set, :penn
|
24
24
|
@noun = Treat::Entities::Word.new('fox')
|
25
|
-
@noun.set :
|
25
|
+
@noun.set :category, :noun
|
26
26
|
@noun.set :tag, 'NN'
|
27
27
|
@noun.set :tag_set, :penn
|
28
28
|
@aux = Treat::Entities::Word.new('is')
|
29
|
-
@aux.set :
|
29
|
+
@aux.set :category, :verb
|
30
30
|
@aux.set :tag, 'VBZ'
|
31
31
|
@aux.set :tag_set, :penn
|
32
32
|
@verb = Treat::Entities::Word.new('running')
|
33
|
-
@verb.set :
|
33
|
+
@verb.set :category, :verb
|
34
34
|
@verb.set :tag, 'VBG'
|
35
35
|
@verb.set :tag_set, :penn
|
36
36
|
@dot = Treat::Entities::Punctuation.new('.')
|
@@ -62,7 +62,7 @@ module Treat
|
|
62
62
|
end
|
63
63
|
|
64
64
|
def test_type
|
65
|
-
assert_equal :
|
65
|
+
assert_equal :section, @text.type
|
66
66
|
end
|
67
67
|
|
68
68
|
def test_printers
|
data/test/tc_extractors.rb
CHANGED
@@ -25,9 +25,11 @@ module Treat
|
|
25
25
|
# assert_nothing_raised { @doc.named_entity(:abner) }
|
26
26
|
end
|
27
27
|
|
28
|
-
def
|
29
|
-
|
30
|
-
|
28
|
+
def test_keywords
|
29
|
+
assert_nothing_raised do
|
30
|
+
topics = @doc.topic_words(:lda)
|
31
|
+
@doc.keywords(:topics_frequency, topic_words: topics)
|
32
|
+
end
|
31
33
|
end
|
32
34
|
|
33
35
|
def test_topics
|
@@ -38,7 +40,7 @@ module Treat
|
|
38
40
|
@doc.chunk.segment(:tactful).tokenize
|
39
41
|
|
40
42
|
assert_nothing_raised { @doc.statistics(:frequency_of, value: 'the') }
|
41
|
-
assert_nothing_raised { @word.statistics(:
|
43
|
+
assert_nothing_raised { @word.statistics(:frequency_in) }
|
42
44
|
# assert_nothing_raised { @doc.statistics(:position_in) }
|
43
45
|
# assert_nothing_raised { @doc.statistics(:transition_matrix) }
|
44
46
|
# assert_nothing_raised { @doc.statistics(:transition_probability) }
|