treat 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (147) hide show
  1. data/INSTALL +0 -0
  2. data/LICENSE +28 -0
  3. data/README +0 -0
  4. data/TODO +67 -0
  5. data/bin/INFO +1 -0
  6. data/examples/benchmark.rb +81 -0
  7. data/examples/keywords.rb +60 -0
  8. data/examples/texts/bugged_out.txt +26 -0
  9. data/examples/texts/half_cocked_basel.txt +16 -0
  10. data/examples/texts/hedge_funds.txt +24 -0
  11. data/examples/texts/hose_and_dry.txt +19 -0
  12. data/examples/texts/hungarys_troubles.txt +46 -0
  13. data/examples/texts/indias_slowdown.txt +15 -0
  14. data/examples/texts/merkozy_rides_again.txt +24 -0
  15. data/examples/texts/prada_is_not_walmart.txt +9 -0
  16. data/examples/texts/republican_nomination.txt +26 -0
  17. data/examples/texts/to_infinity_and_beyond.txt +15 -0
  18. data/lib/treat.rb +91 -0
  19. data/lib/treat/buildable.rb +115 -0
  20. data/lib/treat/categories.rb +29 -0
  21. data/lib/treat/category.rb +28 -0
  22. data/lib/treat/delegatable.rb +90 -0
  23. data/lib/treat/detectors.rb +28 -0
  24. data/lib/treat/detectors/encoding/native.rb +12 -0
  25. data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
  26. data/lib/treat/detectors/format/file.rb +36 -0
  27. data/lib/treat/detectors/language/language_detector.rb +19 -0
  28. data/lib/treat/detectors/language/what_language.rb +29 -0
  29. data/lib/treat/entities.rb +52 -0
  30. data/lib/treat/entities/collection.rb +19 -0
  31. data/lib/treat/entities/constituents.rb +15 -0
  32. data/lib/treat/entities/document.rb +11 -0
  33. data/lib/treat/entities/entity.rb +242 -0
  34. data/lib/treat/entities/sentence.rb +8 -0
  35. data/lib/treat/entities/text.rb +7 -0
  36. data/lib/treat/entities/tokens.rb +37 -0
  37. data/lib/treat/entities/zones.rb +17 -0
  38. data/lib/treat/exception.rb +5 -0
  39. data/lib/treat/extractors.rb +41 -0
  40. data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
  41. data/lib/treat/extractors/named_entity/abner.rb +20 -0
  42. data/lib/treat/extractors/named_entity/stanford.rb +174 -0
  43. data/lib/treat/extractors/statistics/frequency.rb +22 -0
  44. data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
  45. data/lib/treat/extractors/statistics/position_in.rb +13 -0
  46. data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
  47. data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
  48. data/lib/treat/extractors/time/chronic.rb +12 -0
  49. data/lib/treat/extractors/time/native.rb +12 -0
  50. data/lib/treat/extractors/time/nickel.rb +45 -0
  51. data/lib/treat/extractors/topic_words/lda.rb +71 -0
  52. data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
  53. data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
  54. data/lib/treat/extractors/topics/reuters.rb +91 -0
  55. data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
  56. data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
  57. data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
  58. data/lib/treat/feature.rb +53 -0
  59. data/lib/treat/formatters.rb +44 -0
  60. data/lib/treat/formatters/cleaners/html.rb +17 -0
  61. data/lib/treat/formatters/readers/autoselect.rb +35 -0
  62. data/lib/treat/formatters/readers/gocr.rb +24 -0
  63. data/lib/treat/formatters/readers/html.rb +13 -0
  64. data/lib/treat/formatters/readers/ocropus.rb +31 -0
  65. data/lib/treat/formatters/readers/pdf.rb +17 -0
  66. data/lib/treat/formatters/readers/txt.rb +15 -0
  67. data/lib/treat/formatters/serializers/xml.rb +48 -0
  68. data/lib/treat/formatters/serializers/yaml.rb +15 -0
  69. data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
  70. data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
  71. data/lib/treat/formatters/unserializers/xml.rb +79 -0
  72. data/lib/treat/formatters/unserializers/yaml.rb +15 -0
  73. data/lib/treat/formatters/visualizers/dot.rb +73 -0
  74. data/lib/treat/formatters/visualizers/html.rb +12 -0
  75. data/lib/treat/formatters/visualizers/inspect.rb +16 -0
  76. data/lib/treat/formatters/visualizers/short_value.rb +14 -0
  77. data/lib/treat/formatters/visualizers/standoff.rb +41 -0
  78. data/lib/treat/formatters/visualizers/tree.rb +28 -0
  79. data/lib/treat/formatters/visualizers/txt.rb +31 -0
  80. data/lib/treat/group.rb +96 -0
  81. data/lib/treat/inflectors.rb +50 -0
  82. data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
  83. data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
  84. data/lib/treat/inflectors/declensors/en.rb +18 -0
  85. data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
  86. data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
  87. data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
  88. data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
  89. data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
  90. data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
  91. data/lib/treat/inflectors/stemmers/porter.rb +158 -0
  92. data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
  93. data/lib/treat/inflectors/stemmers/uea.rb +30 -0
  94. data/lib/treat/lexicalizers.rb +49 -0
  95. data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
  96. data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
  97. data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
  98. data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
  99. data/lib/treat/lexicalizers/tag/brill.rb +101 -0
  100. data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
  101. data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
  102. data/lib/treat/processors.rb +45 -0
  103. data/lib/treat/processors/chunkers/txt.rb +27 -0
  104. data/lib/treat/processors/parsers/enju.rb +214 -0
  105. data/lib/treat/processors/parsers/stanford.rb +60 -0
  106. data/lib/treat/processors/segmenters/punkt.rb +48 -0
  107. data/lib/treat/processors/segmenters/stanford.rb +45 -0
  108. data/lib/treat/processors/segmenters/tactful.rb +34 -0
  109. data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
  110. data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
  111. data/lib/treat/processors/tokenizers/perl.rb +96 -0
  112. data/lib/treat/processors/tokenizers/punkt.rb +42 -0
  113. data/lib/treat/processors/tokenizers/stanford.rb +33 -0
  114. data/lib/treat/processors/tokenizers/tactful.rb +59 -0
  115. data/lib/treat/proxies.rb +66 -0
  116. data/lib/treat/registrable.rb +26 -0
  117. data/lib/treat/resources.rb +10 -0
  118. data/lib/treat/resources/categories.rb +18 -0
  119. data/lib/treat/resources/delegates.rb +96 -0
  120. data/lib/treat/resources/dependencies.rb +0 -0
  121. data/lib/treat/resources/edges.rb +8 -0
  122. data/lib/treat/resources/formats.rb +23 -0
  123. data/lib/treat/resources/languages.rb +86 -0
  124. data/lib/treat/resources/languages.txt +504 -0
  125. data/lib/treat/resources/tags.rb +393 -0
  126. data/lib/treat/sugar.rb +43 -0
  127. data/lib/treat/tree.rb +174 -0
  128. data/lib/treat/utilities.rb +127 -0
  129. data/lib/treat/visitable.rb +27 -0
  130. data/test/profile.rb +2 -0
  131. data/test/tc_detectors.rb +27 -0
  132. data/test/tc_entity.rb +105 -0
  133. data/test/tc_extractors.rb +48 -0
  134. data/test/tc_formatters.rb +46 -0
  135. data/test/tc_inflectors.rb +39 -0
  136. data/test/tc_lexicalizers.rb +39 -0
  137. data/test/tc_processors.rb +36 -0
  138. data/test/tc_resources.rb +27 -0
  139. data/test/tc_treat.rb +64 -0
  140. data/test/tc_tree.rb +60 -0
  141. data/test/tests.rb +19 -0
  142. data/test/texts.rb +20 -0
  143. data/test/texts/english/long.html +24 -0
  144. data/test/texts/english/long.txt +22 -0
  145. data/test/texts/english/medium.txt +5 -0
  146. data/test/texts/english/short.txt +3 -0
  147. metadata +412 -0
module Treat
  module Lexicalizers
    module Tag
      # Wrapper for the Stanford part-of-speech tagger,
      # accessed through the Ruby-Java bridge (Rjb).
      class Stanford
        # Require the Ruby-Java bridge, load the JVM with the
        # tagger JAR on the classpath and import the Java
        # classes this adapter needs.
        silently do
          require 'rjb'
          jar = "#{Treat.bin}/stanford_tagger/stanford-postagger.jar"
          unless File.readable?(jar)
            raise "Could not find stanford tagger JAR file in #{jar}." +
                  " You may need to set Treat.bin to a custom value."
          end
          # Reuse the path we just checked instead of rebuilding it.
          Rjb::load(jar, ['-Xms256M', '-Xmx512M'])
          MaxentTagger = ::Rjb::import('edu.stanford.nlp.tagger.maxent.MaxentTagger')
          Word = ::Rjb::import('edu.stanford.nlp.ling.Word')
          List = ::Rjb::import('java.util.ArrayList')
        end
        # A list of models to use by language.
        # Other models are available; see the models/ folder
        # in the Stanford Tagger distribution files.
        LanguageToModel = {
          eng: 'english-left3words-distsim.tagger',
          ger: 'german-fast.tagger',
          fra: 'french.tagger',
          ara: 'arabic-fast.tagger',
          chi: 'chinese.tagger'
        }
        # Hold one tagger per language.
        @@taggers = {}
        # Hold the user-set options for each language.
        @@options = {}
        # Hold the default options.
        DefaultOptions = {}
        # Tag the entity using one of the Stanford taggers and
        # return the tag of the last tagged token, or nil if the
        # tagger produced no output.
        #
        # Options:
        #   :model => (String) filename of a custom tagger model.
        def self.tag(entity, options = {})
          lang = entity.language
          # Find the model: an explicit option wins over the
          # per-language default.
          model = options[:model] || LanguageToModel[lang]
          if model.nil?
            # Original code was missing the comma after the exception
            # class and the space between "Stanford" and "tagger".
            raise Treat::Exception,
                  "There exists no Stanford tagger model for language #{lang}."
          end
          # Reinitialize the tagger if the options have changed.
          if options != @@options[lang]
            @@options[lang] = DefaultOptions.merge(options)
            @@taggers[lang] = nil # Reset the tagger.
          end
          if @@taggers[lang].nil?
            model = "#{Treat.bin}/stanford_tagger/models/#{model}"
            unless File.readable?(model)
              raise "Could not find a tagger model for language #{lang}: looking in #{model}."
            end
            # The tagger writes progress/licensing noise on load;
            # keep stdout/stderr clean while constructing it.
            silence_streams(STDOUT, STDERR) do
              @@taggers[lang] = MaxentTagger.new(model)
            end
          end
          # Build the Java word list, remembering which Treat entity
          # each list position corresponds to.
          list = List.new
          id_list = {}
          i = 0
          # NOTE(review): only the entity itself is tagged here; the
          # original carried a "Fix..." marker — presumably this should
          # iterate over the entity's tokens. Confirm against callers.
          [entity].each do |word|
            list.add(Word.new(word.to_s))
            id_list[i] = word
            i += 1
          end
          it = @@taggers[lang].apply(list).iterator
          i = 0
          w = nil
          while it.has_next
            w = it.next
            id_list[i].set :tag, w.tag
            i += 1
          end
          # Return the last assigned tag; the original called w.tag
          # unconditionally and crashed (NoMethodError on nil) when
          # the iterator was empty.
          w ? w.tag : nil
        end
      end
    end
  end
end
module Treat
  # Category for processor groups.
  #
  # A processor group is a group of algorithms for the building
  # of trees representing textual entities.
  #
  # The processor groups include:
  #
  # - Chunkers : split a text into zone objects.
  # - Segmenters : split a text or zone into sentence objects.
  # - Tokenizers : split a sentence into Token objects.
  # - Parsers : split a sentence into a tree of constituents
  #   containing other constituents and Token objects, representing
  #   the syntactic structure.
  #
  # NOTE(review): `Group` (extended below) appears to supply the
  # `type=` / `targets=` writers used by each group — confirm in
  # lib/treat/group.rb.
  module Processors
    # Chunkers split a text into zones.
    module Chunkers
      extend Group
      self.type = :transformer
      self.targets = [:document, :text]
    end
    # Segmenters split a text or zone into sentences.
    module Segmenters
      extend Group
      self.type = :transformer
      self.targets = [:document, :text, :zone]
    end
    # Tokenizers split a sentence into Token objects.
    module Tokenizers
      extend Group
      self.type = :transformer
      self.targets = [:document, :text, :zone, :sentence, :constituent]
    end
    # Parsers split a sentence into constituent objects
    # representing its syntactic structure, with the
    # Token objects as children of the constituents.
    module Parsers
      extend Group
      self.type = :transformer
      self.targets = [:document, :text, :zone, :sentence, :constituent]
    end
    # Makes all the groups autoloadable and creates the delegators.
    extend Treat::Category
  end
end
module Treat
  module Processors
    module Chunkers
      # This class separates a plain text file into
      # zones based on a very naive analysis of the
      # file: each non-blank line becomes one zone,
      # short lines being treated as titles.
      class Txt
        # Lines shorter than this many characters are
        # assumed to be titles rather than paragraphs.
        TitleLengthThreshold = 60
        # Split the text on newlines, wrap each non-blank line
        # in a Title or Paragraph zone appended to +text+, and
        # return +text+.
        #
        # The original contained an unreachable `if false` branch
        # creating Entities::List objects; it has been removed.
        def self.chunk(text, options = {})
          zones = text.to_s.split("\n")
          zones.each do |zone|
            # Skip blank lines entirely.
            next if zone.strip == ''
            # Naive heuristic: short lines are titles.
            if zone.length < TitleLengthThreshold
              text << Entities::Title.new(zone)
            else
              text << Entities::Paragraph.new(zone)
            end
          end
          text
        end
      end
    end
  end
end
module Treat
  module Processors
    module Parsers
      # The Enju class is a wrapper for the Enju syntactic
      # parser for English. Given a file or string input,
      # the parser formats it, runs it through Enju, and
      # parses the XML output by Enju using the Nokogiri
      # XML reader. It creates wrappers for the sentences,
      # syntactical constituents and tokens that Enju identified.
      #
      # Original paper:
      # Takuya Matsuzaki, Yusuke Miyao, and Jun'ichi Tsujii.
      # 2007. Efficient HPSG Parsing with Supertagging and
      # CFG-filtering. In Proceedings of IJCAI 2007.
      class Enju
        # Require the 'open3' library for interaction
        # with the background Enju process(es).
        require 'open3'
        # Pool of running Enju processes and a round-robin cursor.
        @@parsers = []
        @@i = 0
        # Require the Nokogiri XML parser.
        require 'nokogiri'
        # Maps Enju categories to Treat categories.
        CategoryMap = {
          'ADJ' => :adjective,
          'ADV' => :adverb,
          'CONJ' => :conjunction,
          'COOD' => :conjunction,
          'C' => :complementizer,
          'D' => :determiner,
          'N' => :noun,
          'P' => :preposition,
          'PN' => :punctuation,
          'SC' => :conjunction,
          'V' => :verb,
          'PRT' => :particle
        }
        # Return the [stdin, stdout, ...] pipes of one Enju process,
        # growing the pool up to @@options[:processes] and rotating
        # through it round-robin.
        #
        # NOTE(review): reads @@options, which is only assigned inside
        # self.parse — calling proc before parse raises NameError on
        # the uninitialized class variable. Confirm intended.
        def self.proc
          if @@parsers.size < @@options[:processes]
            @@parsers << ::Open3.popen3("enju -xml -i")
          end
          @@i += 1
          @@i = 0 if @@i == @@parsers.size
          @@parsers[@@i-1]
        end
        # Parse the entity into its syntactical constituents
        # using Enju.
        #
        # Options:
        #   :processes => (Integer) number of Enju processes (default 1).
        def self.parse(entity, options = {})
          options[:processes] ||= 1
          @@options = options
          stdin, stdout = proc
          # Enju needs a sentence terminator; add a period if the
          # text has none and remember to strip it from the result.
          if entity.to_s.count('.') == 0
            remove_last = true
            text = entity.to_s + '.'
          else
            remove_last = false
            text = entity.to_s
          end
          stdin.puts(text + "\n")
          parsed = build(stdout.gets, remove_last)
          # Graft the parsed children onto the original entity,
          # replacing whatever children it had.
          if not parsed.nil?
            entity.remove_all!
            parsed.children.each do |child|
              entity << child
            end
          else
            warn "Couldn't parse the text '#{entity.to_s}'."
          end
          entity
        end
        # Parses an Enju XML output file using the Nokogiri
        # XML reader and converts that structure into a tree
        # of wrappers for textual entities. Returns the root
        # element built (a Sentence), or nil if nothing was read.
        def self.build(xml, remove_last = false)
          # Read in the XML file.
          xml_reader = Nokogiri::XML::Reader.from_memory(xml)
          current_element = nil
          previous_depth = 0
          # Map Enju node ids -> Treat entity ids, and Treat entity
          # ids -> {argument_id => predicate} edge hashes.
          id_table = {}
          edges_table = {}
          # Read the XML file node by node (streaming).
          while xml_reader.read
            # The depth in the XML tree.
            current_depth = xml_reader.depth
            # If we are at the end of the children stack, pop up.
            if previous_depth > current_depth
              current_element = current_element.parent
            end
            # If an end element has been reached,
            # change the depth and pop up on next
            # iteration.
            if xml_reader.node_type ==
              Nokogiri::XML::Reader::TYPE_END_ELEMENT
              previous_depth = current_depth
              next
            end
            attributes = xml_reader.attributes
            # Enju-specific attributes that get an 'enju_' prefix
            # when copied onto the Treat entity.
            prefix = ['schema', 'lexentry', 'type']
            # If the node has attributes, convert them.
            # NOTE(review): when attributes IS empty, the locals
            # new_attributes / edges / id below remain nil for this
            # iteration — the branches that use them rely on Enju
            # always emitting attributes on sentence/cons/tok nodes.
            unless attributes.empty?
              new_attributes = {}
              edges = {}
              id = attributes.delete('id')
              pred = attributes.delete('pred')
              attributes.each_pair do |attribute, value|
                if ['arg1', 'arg2'].include?(attribute)
                  # Semantic arguments become edges keyed by the
                  # target node id, valued by the predicate.
                  edges[value] = pred
                else
                  if attribute == 'cat'
                    if xml_reader.name == 'tok'
                      # Strip a trailing P/X saturation marker from
                      # token categories (except the literal 'PN').
                      if value.length > 1 && ['P', 'X'].include?(value[-1]) &&
                        value != 'PN'
                        new_attributes[:saturated] = (value[-1] == 'P')
                        value = value[0..-2]
                      end
                      cat = CategoryMap[value]
                      new_attributes[:cat] = cat
                    else
                      # Constituent node: keep the raw Enju cat/xcat
                      # and translate the pair to a PTB tag.
                      new_attributes[:enju_cat] = value
                      xcat = attributes['xcat'].split(' ')[0]
                      xcat ||= ''
                      tags = Treat::Resources::Tags::EnjuCatXcatToPTB.select do |m|
                        m[0] == value && m[1] == xcat
                      end
                      # 'UK' marks an unknown cat/xcat combination.
                      if tags.empty?
                        tag = 'UK'
                      else
                        tag = tags[0][2]
                      end
                      new_attributes[:enju_xcat] = xcat
                      attributes.delete('xcat')
                      new_attributes[:tag] = tag
                    end
                  else
                    # Copy any other attribute, prefixing the
                    # Enju-internal ones.
                    pre = prefix.include?(attribute) ? 'enju_' : ''
                    new_attributes[:"#{pre+attribute}"] = value
                  end
                end
              end
              attributes.delete('arg1')
              attributes.delete('arg2')
            end
            # Handle naming conventions: 'pos' becomes :tag.
            if attributes.has_key?('pos')
              new_attributes[:tag] = new_attributes[:pos]
              new_attributes.delete :pos
            end
            # Create the appropriate entity for the element.
            current_value = ''
            attributes = new_attributes
            case xml_reader.name
            when 'sentence'
              current_element = Treat::Entities::Sentence.new('')
              id_table[id] = current_element.id
              edges_table[current_element.id] = edges
              current_element.features = attributes
            when 'cons'
              current_element = current_element <<
              Treat::Entities::Phrase.new('')
              id_table[id] = current_element.id
              edges_table[current_element.id] = edges
              current_element.features = attributes
            when 'tok'
              # Token attributes are attached later, when the text
              # node inside the <tok> element is reached.
              tmp_attributes = attributes
              tmp_edges = edges
            else
              # Text node: strip whitespace and build a Word or
              # other token entity from the remaining characters.
              current_value = xml_reader.value.gsub(/\s+/, "")
              if !current_value.empty?
                current_element = current_element <<
                Treat::Entities::Entity.from_string(current_value)
                if current_element.is_a?(Treat::Entities::Word)
                  current_element.features = tmp_attributes
                  id_table[id] = current_element.id
                  edges_table[current_element.id] = tmp_edges
                end
              end
            end
            previous_depth = current_depth
          end
          # Add the collected edges to the entities.
          unless current_element.nil?
            root = current_element.root
            edges_table.each_pair do |id2, edges2|
              # Next if there are no edges.
              next if edges2.nil?
              entity = root.find(id2)
              edges2.each_pair do |argument, type|
                # Skip this argument if we don't know
                # the target node ('unk' in Enju output).
                next if argument == 'unk'
                entity.associate(id_table[argument], type)
              end
            end
            # Link the head and sem_head to their entities.
            root.each_constituent do |constituent|
              constituent.set :head,
              root.find(id_table[constituent.head])
              constituent.set :sem_head,
              root.find(id_table[constituent.sem_head])
            end
          end
          # Remove the period we added at the end.
          # NOTE(review): if parsing yielded nothing, current_element
          # is nil here and this raises — confirm remove_last can
          # only be true when a sentence was built.
          if remove_last
            last = current_element.punctuations[-1]
            current_element.remove!(last)
          end
          current_element
        end
      end
    end
  end
end
module Treat
  module Processors
    module Parsers
      # Wrapper for the Stanford lexicalized probabilistic
      # parser, accessed through the Ruby-Java bridge (Rjb).
      class Stanford
        # Require the Ruby-Java bridge.
        silently { require 'rjb' }
        jar = "#{Treat.bin}/stanford_parser/stanford-parser.jar"
        unless File.readable?(jar)
          raise "Could not find stanford parser JAR file in #{jar}." +
                " You may need to set Treat.bin to a custom value."
        end
        Rjb::load(jar, ['-Xms256M', '-Xmx512M'])
        LexicalizedParser = ::Rjb::import('edu.stanford.nlp.parser.lexparser.LexicalizedParser')
        # Hold one parser per language.
        @@parsers = {}
        # Parse the entity with the PCFG grammar for its language,
        # replacing its children with the resulting parse tree,
        # and return the entity.
        def self.parse(entity, options = {})
          lang = Treat::Resources::Languages.describe(entity.language).to_s
          pcfg = "#{Treat.bin}/stanford_parser/grammar/#{lang.upcase}PCFG.ser.gz"
          unless File.readable?(pcfg)
            raise "Could not find a language model for #{lang}: looking in #{pcfg}."
          end
          # Lazily build and cache one parser per language.
          @@parsers[lang] ||= LexicalizedParser.new(pcfg)
          parse = @@parsers[lang].apply(entity.to_s)
          entity.remove_all!
          recurse(parse, entity)
          entity
        end
        # Recursively convert the Java parse tree rooted at
        # +java_node+ into Treat entities under +ruby_node+.
        # (Removed an unused `dependencies` local and the dead
        # commented-out iteration over it.)
        def self.recurse(java_node, ruby_node)
          if java_node.num_children == 0
            # Leaf: build a token entity and copy its character
            # offsets from the Java labels.
            ruby_child = Treat::Entities::Entity.from_string(java_node.value)
            labels = java_node.labels.iterator
            while labels.has_next
              label = labels.next
              ruby_child.set :begin_char, label.begin_position
              ruby_child.set :end_char, label.end_position
              # NOTE(review): the tag is copied from the PARENT
              # node here — confirm it should not come from the
              # leaf's own label instead.
              ruby_child.set :tag, ruby_node.tag
            end
            ruby_node << ruby_child
          else
            # Collapse unary chains: a single-child node adds no
            # structure, so recurse straight through it.
            if java_node.num_children == 1
              return recurse(java_node.children[0], ruby_node)
            end
            java_node.children.each do |java_child|
              ruby_child = Treat::Entities::Phrase.new
              ruby_child.set :tag, java_child.value
              ruby_node << ruby_child
              unless java_child.children.empty?
                recurse(java_child, ruby_child)
              end
            end
          end
        end
      end
    end
  end
end
module Treat
  module Processors
    module Segmenters
      # An adapter for the 'punkt-segmenter' gem, which segments
      # texts into sentences based on an unsupervised, language
      # independent algorithm.
      #
      # Original paper: Kiss, Tibor and Strunk, Jan (2006):
      # Unsupervised Multilingual Sentence Boundary Detection.
      # Computational Linguistics 32: 485-525.
      class Punkt
        silently { require 'punkt-segmenter' }
        # Hold one copy of the segmenter per language.
        @@segmenters = {}
        # Hold only one trainer per language.
        @@trainers = {}
        # Texts to train the segmenter on.
        @@training_texts = {
          eng: "A minute is a unit of measurement of time or of angle. The minute is a unit of time equal to 1/60th of an hour or 60 seconds by 1. In the UTC time scale, a minute occasionally has 59 or 61 seconds; see leap second. The minute is not an SI unit; however, it is accepted for use with SI units. The symbol for minute or minutes is min. The fact that an hour contains 60 minutes is probably due to influences from the Babylonians, who used a base-60 or sexagesimal counting system. Colloquially, a min. may also refer to an indefinite amount of time substantially longer than the standardized length."
        }
        # Segment a text into sentence entities using the Punkt
        # segmenter gem, appending each sentence to +entity+ and
        # returning +entity+.
        #
        # Options:
        # :training_text => (String) Text to train the segmenter on.
        def self.segment(entity, options = {})
          lang = entity.language
          # An explicit training text wins over the built-in default.
          training_text = options[:training_text] || @@training_texts[lang]
          unless training_text
            raise "No training text available for language #{lang}."
          end
          # Train once per language and cache the resulting segmenter.
          # NOTE(review): because of this cache, a custom :training_text
          # only takes effect on the FIRST call for a given language —
          # confirm that is intended.
          if @@trainers[lang].nil?
            @@trainers[lang] = ::Punkt::Trainer.new
            @@trainers[lang].train(training_text)
            @@segmenters[lang] =
              ::Punkt::SentenceTokenizer.new(@@trainers[lang].parameters)
          end
          result = @@segmenters[lang].sentences_from_text(entity.to_s,
            :output => :sentences_text)
          result.each do |sentence|
            entity << Entities::Entity.from_string(sentence)
          end
          entity
        end
      end
    end
  end
end