treat 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. data/LICENSE +7 -8
  2. data/TODO +16 -13
  3. data/examples/keywords.rb +89 -1
  4. data/lib/treat/buildable.rb +1 -8
  5. data/lib/treat/categories.rb +3 -4
  6. data/lib/treat/category.rb +1 -1
  7. data/lib/treat/delegatable.rb +1 -1
  8. data/lib/treat/detectors/encoding/native.rb +5 -0
  9. data/lib/treat/detectors/encoding/r_chardet19.rb +2 -3
  10. data/lib/treat/detectors/language/language_detector.rb +4 -0
  11. data/lib/treat/detectors/language/what_language.rb +4 -4
  12. data/lib/treat/detectors.rb +1 -1
  13. data/lib/treat/entities/entity.rb +5 -3
  14. data/lib/treat/entities/tokens.rb +14 -5
  15. data/lib/treat/entities/zones.rb +4 -0
  16. data/lib/treat/entities.rb +7 -5
  17. data/lib/treat/extractors/keywords/topics_frequency.rb +40 -0
  18. data/lib/treat/extractors/statistics/{frequency.rb → frequency_in.rb} +5 -4
  19. data/lib/treat/extractors/statistics/frequency_of.rb +3 -5
  20. data/lib/treat/extractors/statistics/{position_in.rb → position_in_parent.rb} +4 -3
  21. data/lib/treat/extractors/statistics/tf_idf.rb +36 -0
  22. data/lib/treat/extractors/statistics/transition_matrix.rb +25 -25
  23. data/lib/treat/extractors/statistics/transition_probability.rb +8 -4
  24. data/lib/treat/extractors/time/chronic.rb +8 -0
  25. data/lib/treat/extractors/time/native.rb +6 -0
  26. data/lib/treat/extractors/time/nickel.rb +31 -23
  27. data/lib/treat/extractors/topic_words/lda.rb +21 -16
  28. data/lib/treat/extractors/topics/reuters.rb +6 -4
  29. data/lib/treat/extractors.rb +7 -7
  30. data/lib/treat/formatters/readers/abw.rb +32 -0
  31. data/lib/treat/formatters/readers/autoselect.rb +13 -11
  32. data/lib/treat/formatters/readers/doc.rb +13 -0
  33. data/lib/treat/formatters/readers/gocr.rb +2 -0
  34. data/lib/treat/formatters/readers/html.rb +21 -1
  35. data/lib/treat/formatters/readers/ocropus.rb +3 -3
  36. data/lib/treat/formatters/readers/odt.rb +41 -0
  37. data/lib/treat/formatters/readers/pdf.rb +5 -2
  38. data/lib/treat/formatters/readers/txt.rb +2 -0
  39. data/lib/treat/formatters/serializers/xml.rb +3 -2
  40. data/lib/treat/formatters/serializers/yaml.rb +2 -0
  41. data/lib/treat/formatters/unserializers/autoselect.rb +7 -1
  42. data/lib/treat/formatters/unserializers/xml.rb +6 -1
  43. data/lib/treat/formatters/unserializers/yaml.rb +5 -1
  44. data/lib/treat/formatters/visualizers/dot.rb +35 -37
  45. data/lib/treat/formatters/visualizers/html.rb +1 -0
  46. data/lib/treat/formatters/visualizers/inspect.rb +4 -0
  47. data/lib/treat/formatters/visualizers/short_value.rb +18 -3
  48. data/lib/treat/formatters/visualizers/standoff.rb +11 -6
  49. data/lib/treat/formatters/visualizers/tree.rb +5 -1
  50. data/lib/treat/formatters/visualizers/txt.rb +6 -1
  51. data/lib/treat/formatters.rb +1 -1
  52. data/lib/treat/group.rb +4 -3
  53. data/lib/treat/inflectors/cardinal_words/linguistics.rb +14 -17
  54. data/lib/treat/inflectors/conjugations/linguistics.rb +16 -3
  55. data/lib/treat/inflectors/declensions/linguistics.rb +17 -6
  56. data/lib/treat/inflectors/ordinal_words/linguistics.rb +9 -10
  57. data/lib/treat/inflectors/stem/porter.rb +6 -2
  58. data/lib/treat/inflectors/stem/porter_c.rb +4 -1
  59. data/lib/treat/inflectors/stem/uea.rb +4 -4
  60. data/lib/treat/languages/english/tags.rb +16 -0
  61. data/lib/treat/languages/english.rb +4 -1
  62. data/lib/treat/lexicalizers/category/from_tag.rb +4 -4
  63. data/lib/treat/lexicalizers/linkages/naive.rb +3 -3
  64. data/lib/treat/lexicalizers/tag/brill.rb +3 -11
  65. data/lib/treat/lexicalizers/tag/lingua.rb +4 -6
  66. data/lib/treat/lexicalizers.rb +0 -2
  67. data/lib/treat/processors/chunkers/txt.rb +4 -4
  68. data/lib/treat/processors/parsers/enju.rb +3 -17
  69. data/lib/treat/processors/parsers/stanford.rb +4 -0
  70. data/lib/treat/processors/segmenters/punkt.rb +1 -0
  71. data/lib/treat/processors/segmenters/stanford.rb +4 -0
  72. data/lib/treat/processors/segmenters/tactful.rb +4 -1
  73. data/lib/treat/processors/tokenizers/punkt.rb +1 -2
  74. data/lib/treat/processors/tokenizers/stanford.rb +4 -0
  75. data/lib/treat/processors/tokenizers/tactful.rb +1 -1
  76. data/lib/treat/processors.rb +4 -4
  77. data/lib/treat/proxies.rb +18 -11
  78. data/lib/treat/registrable.rb +12 -5
  79. data/lib/treat/sugar.rb +8 -3
  80. data/lib/treat/tree.rb +10 -3
  81. data/lib/treat.rb +55 -55
  82. data/test/tc_entity.rb +7 -7
  83. data/test/tc_extractors.rb +6 -4
  84. data/test/tc_formatters.rb +0 -4
  85. data/test/tests.rb +2 -0
  86. data/test/texts.rb +4 -4
  87. metadata +48 -56
  88. data/examples/texts/bugged_out.txt +0 -26
  89. data/examples/texts/half_cocked_basel.txt +0 -16
  90. data/examples/texts/hedge_funds.txt +0 -24
  91. data/examples/texts/hose_and_dry.txt +0 -19
  92. data/examples/texts/hungarys_troubles.txt +0 -46
  93. data/examples/texts/indias_slowdown.txt +0 -15
  94. data/examples/texts/merkozy_rides_again.txt +0 -24
  95. data/examples/texts/prada_is_not_walmart.txt +0 -9
  96. data/examples/texts/republican_nomination.txt +0 -26
  97. data/examples/texts/to_infinity_and_beyond.txt +0 -15
  98. data/lib/treat/entities/text.rb +0 -7
  99. data/lib/treat/extractors/key_sentences/topics_frequency.rb +0 -49
  100. data/lib/treat/formatters/cleaners/html.rb +0 -17
@@ -29,7 +29,6 @@ module Treat
29
29
  extend Group
30
30
  self.type = :annotator
31
31
  self.targets = [:word, :number]
32
-
33
32
  def self.synonyms(entity, synsets)
34
33
  synsets.collect { |ss| ss.synonyms }.flatten - [entity.value]
35
34
  end
@@ -42,7 +41,6 @@ module Treat
42
41
  def self.hypernyms(entity, synsets)
43
42
  synsets.collect { |ss| ss.hypernyms }.flatten
44
43
  end
45
-
46
44
  end
47
45
  extend Treat::Category
48
46
  end
@@ -5,18 +5,18 @@ module Treat
5
5
  # zones based on a very naive analysis of the
6
6
  # file.
7
7
  class Txt
8
- # Return an array of Zone objects found in the text.
8
+ # Split a document into Zone objects.
9
9
  def self.chunk(text, options = {})
10
10
  zones = text.to_s.split("\n")
11
11
  zones.each do |zone|
12
12
  next if zone.strip == ''
13
13
  if false # fix
14
- text << Entities::List.new(zone)
14
+ text << Treat::Entities::List.new(zone)
15
15
  end
16
16
  if zone.length < 60
17
- text << Entities::Title.new(zone)
17
+ text << Treat::Entities::Title.new(zone)
18
18
  else
19
- text << Entities::Paragraph.new(zone)
19
+ text << Treat::Entities::Paragraph.new(zone)
20
20
  end
21
21
  end
22
22
  text
@@ -20,21 +20,6 @@ module Treat
20
20
  @@i = 0
21
21
  # Require the Nokogiri XML parser.
22
22
  require 'nokogiri'
23
- # Maps Enju categories to Treat categories.
24
- CategoryMap = {
25
- 'ADJ' => :adjective,
26
- 'ADV' => :adverb,
27
- 'CONJ' => :conjunction,
28
- 'COOD' => :conjunction,
29
- 'C' => :complementizer,
30
- 'D' => :determiner,
31
- 'N' => :noun,
32
- 'P' => :preposition,
33
- 'PN' => :punctuation,
34
- 'SC' => :conjunction,
35
- 'V' => :verb,
36
- 'PRT' => :particle
37
- }
38
23
  # Return the process running Enju.
39
24
  def self.proc
40
25
  if @@parsers.size < @@options[:processes]
@@ -55,7 +40,8 @@ module Treat
55
40
  text = entity.to_s + '.'
56
41
  else
57
42
  remove_last = false
58
- text = entity.to_s.gsub('.', '') + '.' # Fix
43
+ text = entity.to_s.gsub('.', '')
44
+ text += '.' unless ['!', '?'].include?(text[-1])
59
45
  end
60
46
  stdin.puts(text + "\n")
61
47
  parsed = build(stdout.gets, remove_last)
@@ -114,7 +100,7 @@ module Treat
114
100
  new_attributes[:saturated] = (value[-1] == 'P')
115
101
  value = value[0..-2]
116
102
  end
117
- cat = CategoryMap[value]
103
+ cat = Treat::Languages::English::EnjuCatToCategory[value]
118
104
  new_attributes[:cat] = cat
119
105
  else
120
106
  new_attributes[:enju_cat] = value
@@ -1,6 +1,7 @@
1
1
  module Treat
2
2
  module Processors
3
3
  module Parsers
4
+ # A wrapper class for the Stanford parser.
4
5
  class Stanford
5
6
  # Require the Ruby-Java bridge.
6
7
  silence_warnings { require 'rjb' }
@@ -13,6 +14,7 @@ module Treat
13
14
  Rjb::load(jars[0], ['-Xms256M', '-Xmx512M'])
14
15
  LexicalizedParser = ::Rjb::import('edu.stanford.nlp.parser.lexparser.LexicalizedParser')
15
16
  @@parsers = {}
17
+ # Parse the entity using the Stanford parser.
16
18
  def self.parse(entity, options = {})
17
19
  lang = Treat::Languages.describe(entity.language).to_s.upcase
18
20
  pcfg = "#{Treat.bin}/stanford-parser*/grammar/#{lang.upcase}PCFG.ser.gz"
@@ -26,6 +28,8 @@ module Treat
26
28
  recurse(parse, entity)
27
29
  entity
28
30
  end
31
+ # Helper method which recurses the tree supplied by
32
+ # the Stanford parser.
29
33
  def self.recurse(java_node, ruby_node)
30
34
  # Leaf
31
35
  if java_node.num_children == 0
@@ -21,6 +21,7 @@ module Treat
21
21
  # Segment a text using the Punkt segmenter gem.
22
22
  #
23
23
  # Options:
24
+ #
24
25
  # :training_text => (String) Text to train the segmenter on.
25
26
  def self.segment(entity, options = {})
26
27
  lang = entity.language
@@ -1,6 +1,8 @@
1
1
  module Treat
2
2
  module Processors
3
3
  module Segmenters
4
+ # A wrapper for the sentence splitter supplied by
5
+ # the Stanford parser.
4
6
  class Stanford
5
7
  # Require the Ruby-Java bridge.
6
8
  silence_warnings do
@@ -16,6 +18,8 @@ module Treat
16
18
  ::Rjb::import('edu.stanford.nlp.process.DocumentPreprocessor')
17
19
  StringReader = ::Rjb::import('java.io.StringReader')
18
20
  end
21
+ # Segment sentences using the sentence splitter supplied by
22
+ # the Stanford parser.
19
23
  def self.segment(entity, options = {})
20
24
  sr = StringReader.new(entity.to_s)
21
25
  sit = DocumentPreprocessor.new(sr).iterator
@@ -8,7 +8,10 @@ module Treat
8
8
  # based on Splitta, but has support for ‘?’ and ‘!’
9
9
  # as well as primitive handling of XHTML markup.
10
10
  #
11
- # Project website:
11
+ # Project website: https://github.com/SlyShy/Tackful-Tokenizer
12
+ # Original paper: Dan Gillick. 2009. Sentence Boundary Detection
13
+ # and the Problem with the U.S. University of California, Berkeley.
14
+ # http://dgillick.com/resource/sbd_naacl_2009.pdf
12
15
  class Tactful
13
16
  # Require the 'tactful_tokenizer' gem.
14
17
  silence_warnings { require 'tactful_tokenizer' }
@@ -26,12 +26,11 @@ module Treat
26
26
  ReWordTokenizer = /#{ReMultiCharPunct}|(?=#{ReWordStart})\S+?(?=\s|$|#{ReNonWordChars}|#{ReMultiCharPunct}|,(?=$|\s|#{ReNonWordChars}|#{ReMultiCharPunct}))|\S/
27
27
  RePeriodContext = /\S*#{ReSentEndChars}(?=(?<after_tok>#{ReNonWordChars}|\s+(?<next_tok>\S+)))/
28
28
  # Tokenize the text using the algorithm lifted from
29
- # the Punkt tokenizer.
29
+ # the Punkt tokenizer gem.
30
30
  #
31
31
  # Options: none.
32
32
  def self.tokenize(entity, options = {})
33
33
  entity.to_s.scan(ReWordTokenizer).each do |token|
34
- puts token
35
34
  entity << Treat::Entities::Entity.from_string(token)
36
35
  end
37
36
  entity
@@ -1,6 +1,8 @@
1
1
  module Treat
2
2
  module Processors
3
3
  module Tokenizers
4
+ # A wrapper for the Stanford parser's Penn-Treebank
5
+ # style tokenizer.
4
6
  class Stanford
5
7
  # Require the Ruby-Java bridge.
6
8
  silence_warnings do
@@ -18,6 +20,8 @@ module Treat
18
20
  CoreLabelTokenFactory = ::Rjb::import('edu.stanford.nlp.process.CoreLabelTokenFactory')
19
21
  StringReader = ::Rjb::import('java.io.StringReader')
20
22
  end
23
+ # Tokenize the entity using a Penn-Treebank style tokenizer
24
+ # included with the Stanford Parser.
21
25
  def self.tokenize(entity, options = {})
22
26
  ptbt = PTBTokenizer.new(
23
27
  StringReader.new(entity.to_s),
@@ -41,7 +41,7 @@ module Treat
41
41
  [/([Cc])annot/, '\1an not']
42
42
  ]
43
43
  # Tokenize the entity using a rule-based algorithm
44
- # which has been lifted from the 'tactful-tokenizer'
44
+ # that has been lifted from the 'tactful-tokenizer'
45
45
  # gem.
46
46
  def self.tokenize(entity, options = {})
47
47
  s = entity.to_s
@@ -17,19 +17,19 @@ module Treat
17
17
  module Chunkers
18
18
  extend Group
19
19
  self.type = :transformer
20
- self.targets = [:document, :text]
20
+ self.targets = [:document, :zone]
21
21
  end
22
22
  # Segmenters split a text or zone into sentences.
23
23
  module Segmenters
24
24
  extend Group
25
25
  self.type = :transformer
26
- self.targets = [:document, :text, :zone]
26
+ self.targets = [:document, :zone]
27
27
  end
28
28
  # Tokenizers splits a sentence into Token objects.
29
29
  module Tokenizers
30
30
  extend Group
31
31
  self.type = :transformer
32
- self.targets = [:document, :text, :zone, :sentence, :constituent]
32
+ self.targets = [:document, :zone, :sentence, :constituent]
33
33
  end
34
34
  # Parsers split a sentence into constituent objects
35
35
  # representing its syntactic structure, with the
@@ -37,7 +37,7 @@ module Treat
37
37
  module Parsers
38
38
  extend Group
39
39
  self.type = :transformer
40
- self.targets = [:document, :text, :zone, :sentence, :constituent]
40
+ self.targets = [:document, :zone, :sentence, :constituent]
41
41
  end
42
42
  # Makes all the groups autoloadable and creates the delegators.
43
43
  extend Treat::Category
data/lib/treat/proxies.rb CHANGED
@@ -1,11 +1,13 @@
1
1
  module Treat
2
- # Proxies install Treat functions on Rubycore classes.
2
+ # Proxies install Treat functions on core Ruby classes.
3
3
  module Proxies
4
4
  # The module proxy provides functionality common
5
5
  # to the different types of proxies.
6
6
  module Proxy
7
+ # Build the entity corresponding to the proxied
8
+ # object and send the method call to the entity.
7
9
  def method_missing(sym, *args, &block)
8
- if Categories.have_method?(sym)
10
+ if Treat::Categories.have_method?(sym)
9
11
  to_entity.send(sym, *args)
10
12
  else
11
13
  super(sym, *args, &block)
@@ -16,8 +18,8 @@ module Treat
16
18
  end
17
19
  end
18
20
  # Install Treat functions on String objects.
19
- module StringProxy
20
- include Proxy
21
+ module String
22
+ include Treat::Proxies::Proxy
21
23
  # Save the string to the specified file.
22
24
  def save(file)
23
25
  File.open(file, 'w') { |f| f.write(self) }
@@ -28,16 +30,21 @@ module Treat
28
30
  end
29
31
  end
30
32
  # Install Treat functions on Numeric objects.
31
- module NumericProxy
32
- include Proxy
33
+ module Numeric
34
+ include Treat::Proxies::Proxy
33
35
  # Return the entity corresponding to the number.
34
36
  def to_entity(builder = nil)
35
37
  Treat::Entities::Entity.from_numeric(self)
36
38
  end
37
39
  end
38
40
  # Install Treat functions on Array objects.
39
- module ArrayProxy
40
- include Proxy
41
+ module Array
42
+ include Treat::Proxies::Proxy
43
+ # The behaviour of this proxy is special:
44
+ # if a Treat function is called on an array,
45
+ # the function will be called on each element
46
+ # of the array and a new array with the
47
+ # results will be returned.
41
48
  def method_missing(sym, *args, &block)
42
49
  if Category.has_method?(sym)
43
50
  array = []
@@ -59,8 +66,8 @@ module Treat
59
66
  end
60
67
  end
61
68
  # Include the proxies in the core classes.
62
- String.class_eval { include StringProxy }
63
- Numeric.class_eval { include NumericProxy }
64
- Array.class_eval { include ArrayProxy }
69
+ ::String.class_eval { include Treat::Proxies::String }
70
+ ::Numeric.class_eval { include Treat::Proxies::Numeric }
71
+ ::Array.class_eval { include Treat::Proxies::Array }
65
72
  end
66
73
  end
@@ -3,20 +3,27 @@ module Treat
3
3
  # Registers a token in the @token_registry
4
4
  # hash in the root node.
5
5
  def register_token(token)
6
- if is_root?
6
+ if is_root? || type == :document
7
7
  @token_registry ||= {value: {}, id: {}}
8
8
  @token_registry[:id][token.id] = token
9
- @token_registry[:value][token.value] ||= []
10
- @token_registry[:value][token.value] << token
9
+ @token_registry[:value][token.to_s] ||= []
10
+ @token_registry[:value][token.to_s] << token
11
+ if has_parent? && type == :document
12
+ @parent.register_token(token)
13
+ end
11
14
  else
12
15
  @parent.register_token(token)
13
16
  end
14
17
  end
15
18
  # Find the token registry, which is
16
19
  # always in the root node.
17
- def token_registry
20
+ def token_registry(type = nil)
21
+ if self.type == type
22
+ @token_registry ||= {value: {}, id: {}}
23
+ return @token_registry
24
+ end
18
25
  if has_parent?
19
- @parent.token_registry
26
+ @parent.token_registry(type)
20
27
  else
21
28
  @token_registry ||= {value: {}, id: {}}
22
29
  @token_registry
data/lib/treat/sugar.rb CHANGED
@@ -1,5 +1,10 @@
1
1
  module Treat
2
+ # This module provides syntactic sugar in the following manner:
3
+ # all entities found under Treat::Entities will be made
4
+ # available within the global namespace. For example,
5
+ # Treat::Entities::Word can now be referred to as simply 'Word'.
2
6
  module Sugar
7
+ # Installs syntactic sugar.
3
8
  def edulcorate
4
9
  return if @@edulcorated
5
10
  @@edulcorated = true
@@ -13,6 +18,7 @@ module Treat
13
18
  end
14
19
  end
15
20
  end
21
+ # Uninstalls syntactic sugar.
16
22
  def unedulcorate
17
23
  return unless @@edulcorated
18
24
  @@edulcorated = false
@@ -24,14 +30,13 @@ module Treat
24
30
  end
25
31
  end
26
32
  end
27
- # Whtypeher syntactic sugar is
33
+ # Boolean - whether syntactic sugar is
28
34
  # enabled or not.
29
35
  def edulcorated?; @@edulcorated; end
30
36
  # Syntactic sugar is disabled by default.
31
37
  @@edulcorated = false
32
-
33
38
  private
34
-
39
+ # Helper method, yields each entity type and class.
35
40
  def each_entity_class
36
41
  Treat::Entities.list.each do |entity_type|
37
42
  type = :"#{cc(entity_type)}"
data/lib/treat/tree.rb CHANGED
@@ -68,18 +68,20 @@ module Treat
68
68
  end
69
69
  nodes[0]
70
70
  end
71
+ # Retrieve a child node by name or index.
71
72
  def [](name_or_index)
72
73
  if name_or_index == nil
73
74
  raise Treat::Exception,
74
75
  "Non-nil name or index needs to be provided."
75
76
  end
76
77
  if name_or_index.kind_of?(Integer) &&
77
- name_or_index < 1000 # Fix
78
+ name_or_index < 1000
78
79
  @children[name_or_index]
79
80
  else
80
81
  @children_hash[name_or_index]
81
82
  end
82
83
  end
84
+ # Remove the supplied node or id of a node from the children.
83
85
  def remove!(ion)
84
86
  return nil unless ion
85
87
  if ion.is_a? Treat::Tree::Node
@@ -91,6 +93,7 @@ module Treat
91
93
  @children_hash.delete(ion)
92
94
  end
93
95
  end
96
+ # Remove all children.
94
97
  def remove_all!
95
98
  @children.each { |child| child.set_as_root! }
96
99
  @children.clear
@@ -103,14 +106,18 @@ module Treat
103
106
  id = @parent.children.index(self)
104
107
  @parent.children.at(id + 1) if id
105
108
  end
109
+ # Return the sibling N positions to the left of this one.
106
110
  def left(n = 1); sibling(-1*n); end
111
+ # Return the sibling N positions to the right of this one.
107
112
  def right(n = 1); sibling(1*n); end
113
+ # Return the sibling with position #pos versus
114
+ # this one. #pos can be ... -1, 0, 1, ...
108
115
  def sibling(pos)
109
116
  return nil if is_root?
110
117
  id = @parent.children.index(self)
111
118
  @parent.children.at(id + pos)
112
119
  end
113
- # There must be a cleaner way to do this.
120
+ # Return all brothers and sisters of this node.
114
121
  def siblings
115
122
  r = @parent.children.dup
116
123
  r.delete(self)
@@ -133,7 +140,7 @@ module Treat
133
140
  # Does the entity have a feature ?
134
141
  def has_feature?(feature)
135
142
  @features.has_key?(feature) ||
136
- feature == :value
143
+ [:id, :value, :children, :edges].include?(feature)
137
144
  end
138
145
  alias :has? :has_feature?
139
146
  # Link this node to the target node with
data/lib/treat.rb CHANGED
@@ -1,51 +1,50 @@
1
- #
2
1
  # Main namespace for Treat modules.
3
2
  #
4
- # 1. Entities
3
+ # === Entities
5
4
  #
6
- # Entities are Tree structures that represent any textual
7
- # entity (from a collection of texts down to an individual
8
- # word) with a value, features, children and edges linking
9
- # it to other textual entities. Sugar provides syntactic sugar
10
- # for Entities and can be enabled by running Treat.edulcorate.
5
+ # Entities are Tree structures that represent any textual
6
+ # entity (from a collection of texts down to an individual
7
+ # word) with a value, features, children and edges linking
8
+ # it to other textual entities. Sugar provides syntactic sugar
9
+ # for Entities and can be enabled by running Treat.edulcorate.
11
10
  #
12
- # Here are some example of how to create entities:
11
+ # Here are some examples of how to create entities:
13
12
  #
14
- # c = Collection 'folder_with_documents'
15
- # d = Document 'filename.txt' # (or PDF, html, xml, png, jpg, gif).
16
- # p = Paragraph 'A short story. The end.'
17
- # s = Sentence 'That is not a sentence.'
18
- # w = Word 'fox'
13
+ # c = Collection 'folder_with_documents'
14
+ # d = Document 'filename.txt' # (or PDF, html, xml, png, jpg, gif).
15
+ # p = Paragraph 'A short story. The end.'
16
+ # s = Sentence 'That is not a sentence.'
17
+ # w = Word 'fox'
19
18
  #
20
- # Here's a full list of entities (subtypes in parentheses):
21
- # Collection, Document, Zone (Section, Title, Paragraph or List),
22
- # Sentence, Constituent (Phrase or Clause), Token (Word, Number,
23
- # Symbol or Punctuation).
19
+ # Here's a full list of entities (subtypes in parentheses):
20
+ # Collection, Document, Zone (Section, Title, Paragraph or List),
21
+ # Sentence, Constituent (Phrase or Clause), Token (Word, Number,
22
+ # Symbol or Punctuation).
23
+ #
24
+ # === Proxies
24
25
  #
25
- # 2. Proxies
26
+ # Proxies allow the Treat functions to be called on the core
27
+ # Ruby classes String, Numeric and Array. They build the entity
28
+ # corresponding to the supplied raw text and send the requested
29
+ # function to it.
26
30
  #
27
- # Proxies allow the Treat functions to be called on the core
28
- # Ruby classes String, Numeric and Array. They build the entity
29
- # corresponding to the supplied raw text and send the requested
30
- # function to it.
31
- #
32
- # For example,
31
+ # For example,
33
32
  #
34
- # 'fox'.tag
33
+ # 'fox'.tag
35
34
  #
36
- # Is equivalent to:
35
+ # Is equivalent to:
37
36
  #
38
- # w = Word 'fox'
39
- # w.tag
37
+ # w = Word 'fox'
38
+ # w.tag
40
39
  #
41
- # 3. Functions
40
+ # === Functions
42
41
  #
43
- # A class is defined for each implemented algorithm performing a given
44
- # task. These classes are clustered into groups of algorithms performing
45
- # the same given task (Group), and the groups are clustered into Categories
46
- # of groups performing related tasks.
42
+ # A class is defined for each implemented algorithm performing a given
43
+ # task. These classes are clustered into groups of algorithms performing
44
+ # the same given task (Group), and the groups are clustered into Categories
45
+ # of groups performing related tasks.
47
46
  #
48
- # Here are the different Categories:
47
+ # Here are the different Categories:
49
48
  #
50
49
  # - Detectors - Category for language, encoding, and format
51
50
  # detectors.
@@ -60,22 +59,22 @@
60
59
  # - Processors - Namespace for algorithms that process collections and
61
60
  # documents into trees.
62
61
  #
63
- # 3. Linguistic resources
62
+ # === Linguistic resources
64
63
  #
65
- # The Languages module contains linguistic information about
66
- # languages (full ISO-639-1 and 2 language list, tag alignments
67
- # for three treebanks, word categories, etc.)
64
+ # The Languages module contains linguistic information about
65
+ # languages (full ISO-639-1 and 2 language list, tag alignments
66
+ # for three treebanks, word categories, etc.)
68
67
  #
69
- # 4. Mixins for entities.
68
+ # === Mixins for entities.
70
69
  #
71
- # Buildable, Delegatable, Visitable and Registrable are
72
- # or extended by Entity and provide it with the ability to be built,
73
- # to delegate function calls, to accept visitors and to maintain a
74
- # token registry, respectively.
70
+ # Buildable, Delegatable, Visitable and Registrable are
71
+ # or extended by Entity and provide it with the ability to be built,
72
+ # to delegate function calls, to accept visitors and to maintain a
73
+ # token registry, respectively.
75
74
  #
76
- # 5. Exception
75
+ # === Exception class.
77
76
  #
78
- # Exception defines a custom exception for the Treat module.
77
+ # Exception defines a custom exception class for the Treat module.
79
78
  #
80
79
  module Treat
81
80
 
@@ -85,20 +84,20 @@ module Treat
85
84
  end
86
85
 
87
86
  # The current version of Treat.
88
- VERSION = "0.1.2"
87
+ VERSION = "0.1.3"
89
88
 
90
- # $LOAD_PATH << '/ruby/treat/lib/' # Remove for release
89
+ # $LOAD_PATH << '/ruby/treat/lib/' # Remove for release
91
90
 
92
91
  # Create class variables for the Treat module.
93
92
  class << self
94
- # Default language to use when detect_language is false
93
+ # Symbol - default language to use when detect_language is false.
95
94
  attr_accessor :default_language
96
- # Default encoding to use.
95
+ # Symbol - default encoding to use.
97
96
  attr_accessor :default_encoding
98
97
  # Boolean - detect language or use default?
99
98
  attr_accessor :detect_language
100
- # Identifier - the ideal entity level to detect language at
101
- # (:entity, :sentence, :zone, :text, :document, klass.)
99
+ # Symbol - the ideal entity level to detect language at
100
+ # (e.g., :entity, :sentence, :zone, :section, :document)
102
101
  attr_accessor :language_detection_level
103
102
  # String - main folder for executable files.
104
103
  attr_accessor :bin
@@ -117,13 +116,13 @@ module Treat
117
116
  # Turn language detection off by default.
118
117
  self.detect_language = false
119
118
  # Detect the language once per text by default.
120
- self.language_detection_level = :text
119
+ self.language_detection_level = :section
121
120
  # Set the lib path to that of this file.
122
121
  self.lib = File.dirname(__FILE__)
123
122
  # Set the paths to the bin, test and tmp folders.
124
- self.bin = self.lib + '/../bin/'
125
- self.test = self.lib + '/../test/'
126
- self.tmp = self.lib + '/../tmp/'
123
+ self.bin = self.lib + '/../bin'
124
+ self.test = self.lib + '/../test'
125
+ self.tmp = self.lib + '/../tmp'
127
126
 
128
127
  # Require modified core classes.
129
128
  require 'treat/object'
@@ -137,6 +136,7 @@ module Treat
137
136
  require 'treat/proxies'
138
137
  require 'treat/sugar'
139
138
 
139
+ # Make sugar available when needed.
140
140
  extend Sugar
141
141
 
142
142
  end
data/test/tc_entity.rb CHANGED
@@ -2,7 +2,7 @@ module Treat
2
2
  module Tests
3
3
  class TestEntity < Test::Unit::TestCase
4
4
  def setup
5
- @text = Treat::Entities::Text.new
5
+ @text = Treat::Entities::Section.new
6
6
 
7
7
  @sentence = Treat::Entities::Sentence.new
8
8
 
@@ -14,23 +14,23 @@ module Treat
14
14
  @adj_phrase.set :tag, 'ADJP'
15
15
 
16
16
  @det = Treat::Entities::Word.new('The')
17
- @det.set :cat, :determiner
17
+ @det.set :category, :determiner
18
18
  @det.set :tag, 'DT'
19
19
  @det.set :tag_set, :penn
20
20
  @adj = Treat::Entities::Word.new('lazy')
21
- @adj.set :cat, :adjective
21
+ @adj.set :category, :adjective
22
22
  @adj.set :tag, 'JJ'
23
23
  @adj.set :tag_set, :penn
24
24
  @noun = Treat::Entities::Word.new('fox')
25
- @noun.set :cat, :noun
25
+ @noun.set :category, :noun
26
26
  @noun.set :tag, 'NN'
27
27
  @noun.set :tag_set, :penn
28
28
  @aux = Treat::Entities::Word.new('is')
29
- @aux.set :cat, :verb
29
+ @aux.set :category, :verb
30
30
  @aux.set :tag, 'VBZ'
31
31
  @aux.set :tag_set, :penn
32
32
  @verb = Treat::Entities::Word.new('running')
33
- @verb.set :cat, :verb
33
+ @verb.set :category, :verb
34
34
  @verb.set :tag, 'VBG'
35
35
  @verb.set :tag_set, :penn
36
36
  @dot = Treat::Entities::Punctuation.new('.')
@@ -62,7 +62,7 @@ module Treat
62
62
  end
63
63
 
64
64
  def test_type
65
- assert_equal :text, @text.type
65
+ assert_equal :section, @text.type
66
66
  end
67
67
 
68
68
  def test_printers
@@ -25,9 +25,11 @@ module Treat
25
25
  # assert_nothing_raised { @doc.named_entity(:abner) }
26
26
  end
27
27
 
28
- def test_key_sentences
29
- topics = @doc.topic_words(:lda)
30
- assert_nothing_raised { @doc.key_sentences(:topics_frequency, topics) }
28
+ def test_keywords
29
+ assert_nothing_raised do
30
+ topics = @doc.topic_words(:lda)
31
+ @doc.keywords(:topics_frequency, topic_words: topics)
32
+ end
31
33
  end
32
34
 
33
35
  def test_topics
@@ -38,7 +40,7 @@ module Treat
38
40
  @doc.chunk.segment(:tactful).tokenize
39
41
 
40
42
  assert_nothing_raised { @doc.statistics(:frequency_of, value: 'the') }
41
- assert_nothing_raised { @word.statistics(:frequency) }
43
+ assert_nothing_raised { @word.statistics(:frequency_in) }
42
44
  # assert_nothing_raised { @doc.statistics(:position_in) }
43
45
  # assert_nothing_raised { @doc.statistics(:transition_matrix) }
44
46
  # assert_nothing_raised { @doc.statistics(:transition_probability) }
@@ -37,10 +37,6 @@ module Treat
37
37
  assert_nothing_raised { @sentence.visualize(:standoff) }
38
38
  end
39
39
 
40
- def test_cleaners
41
- assert_nothing_raised { @html_doc.clean(:html) }
42
- end
43
-
44
40
  end
45
41
  end
46
42
  end