treat 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/INSTALL +0 -0
- data/LICENSE +28 -0
- data/README +0 -0
- data/TODO +67 -0
- data/bin/INFO +1 -0
- data/examples/benchmark.rb +81 -0
- data/examples/keywords.rb +60 -0
- data/examples/texts/bugged_out.txt +26 -0
- data/examples/texts/half_cocked_basel.txt +16 -0
- data/examples/texts/hedge_funds.txt +24 -0
- data/examples/texts/hose_and_dry.txt +19 -0
- data/examples/texts/hungarys_troubles.txt +46 -0
- data/examples/texts/indias_slowdown.txt +15 -0
- data/examples/texts/merkozy_rides_again.txt +24 -0
- data/examples/texts/prada_is_not_walmart.txt +9 -0
- data/examples/texts/republican_nomination.txt +26 -0
- data/examples/texts/to_infinity_and_beyond.txt +15 -0
- data/lib/treat.rb +91 -0
- data/lib/treat/buildable.rb +115 -0
- data/lib/treat/categories.rb +29 -0
- data/lib/treat/category.rb +28 -0
- data/lib/treat/delegatable.rb +90 -0
- data/lib/treat/detectors.rb +28 -0
- data/lib/treat/detectors/encoding/native.rb +12 -0
- data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
- data/lib/treat/detectors/format/file.rb +36 -0
- data/lib/treat/detectors/language/language_detector.rb +19 -0
- data/lib/treat/detectors/language/what_language.rb +29 -0
- data/lib/treat/entities.rb +52 -0
- data/lib/treat/entities/collection.rb +19 -0
- data/lib/treat/entities/constituents.rb +15 -0
- data/lib/treat/entities/document.rb +11 -0
- data/lib/treat/entities/entity.rb +242 -0
- data/lib/treat/entities/sentence.rb +8 -0
- data/lib/treat/entities/text.rb +7 -0
- data/lib/treat/entities/tokens.rb +37 -0
- data/lib/treat/entities/zones.rb +17 -0
- data/lib/treat/exception.rb +5 -0
- data/lib/treat/extractors.rb +41 -0
- data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
- data/lib/treat/extractors/named_entity/abner.rb +20 -0
- data/lib/treat/extractors/named_entity/stanford.rb +174 -0
- data/lib/treat/extractors/statistics/frequency.rb +22 -0
- data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
- data/lib/treat/extractors/statistics/position_in.rb +13 -0
- data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
- data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
- data/lib/treat/extractors/time/chronic.rb +12 -0
- data/lib/treat/extractors/time/native.rb +12 -0
- data/lib/treat/extractors/time/nickel.rb +45 -0
- data/lib/treat/extractors/topic_words/lda.rb +71 -0
- data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
- data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
- data/lib/treat/extractors/topics/reuters.rb +91 -0
- data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
- data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
- data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
- data/lib/treat/feature.rb +53 -0
- data/lib/treat/formatters.rb +44 -0
- data/lib/treat/formatters/cleaners/html.rb +17 -0
- data/lib/treat/formatters/readers/autoselect.rb +35 -0
- data/lib/treat/formatters/readers/gocr.rb +24 -0
- data/lib/treat/formatters/readers/html.rb +13 -0
- data/lib/treat/formatters/readers/ocropus.rb +31 -0
- data/lib/treat/formatters/readers/pdf.rb +17 -0
- data/lib/treat/formatters/readers/txt.rb +15 -0
- data/lib/treat/formatters/serializers/xml.rb +48 -0
- data/lib/treat/formatters/serializers/yaml.rb +15 -0
- data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
- data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
- data/lib/treat/formatters/unserializers/xml.rb +79 -0
- data/lib/treat/formatters/unserializers/yaml.rb +15 -0
- data/lib/treat/formatters/visualizers/dot.rb +73 -0
- data/lib/treat/formatters/visualizers/html.rb +12 -0
- data/lib/treat/formatters/visualizers/inspect.rb +16 -0
- data/lib/treat/formatters/visualizers/short_value.rb +14 -0
- data/lib/treat/formatters/visualizers/standoff.rb +41 -0
- data/lib/treat/formatters/visualizers/tree.rb +28 -0
- data/lib/treat/formatters/visualizers/txt.rb +31 -0
- data/lib/treat/group.rb +96 -0
- data/lib/treat/inflectors.rb +50 -0
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
- data/lib/treat/inflectors/declensors/en.rb +18 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
- data/lib/treat/inflectors/stemmers/porter.rb +158 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
- data/lib/treat/inflectors/stemmers/uea.rb +30 -0
- data/lib/treat/lexicalizers.rb +49 -0
- data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
- data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
- data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
- data/lib/treat/lexicalizers/tag/brill.rb +101 -0
- data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
- data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
- data/lib/treat/processors.rb +45 -0
- data/lib/treat/processors/chunkers/txt.rb +27 -0
- data/lib/treat/processors/parsers/enju.rb +214 -0
- data/lib/treat/processors/parsers/stanford.rb +60 -0
- data/lib/treat/processors/segmenters/punkt.rb +48 -0
- data/lib/treat/processors/segmenters/stanford.rb +45 -0
- data/lib/treat/processors/segmenters/tactful.rb +34 -0
- data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
- data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
- data/lib/treat/processors/tokenizers/perl.rb +96 -0
- data/lib/treat/processors/tokenizers/punkt.rb +42 -0
- data/lib/treat/processors/tokenizers/stanford.rb +33 -0
- data/lib/treat/processors/tokenizers/tactful.rb +59 -0
- data/lib/treat/proxies.rb +66 -0
- data/lib/treat/registrable.rb +26 -0
- data/lib/treat/resources.rb +10 -0
- data/lib/treat/resources/categories.rb +18 -0
- data/lib/treat/resources/delegates.rb +96 -0
- data/lib/treat/resources/dependencies.rb +0 -0
- data/lib/treat/resources/edges.rb +8 -0
- data/lib/treat/resources/formats.rb +23 -0
- data/lib/treat/resources/languages.rb +86 -0
- data/lib/treat/resources/languages.txt +504 -0
- data/lib/treat/resources/tags.rb +393 -0
- data/lib/treat/sugar.rb +43 -0
- data/lib/treat/tree.rb +174 -0
- data/lib/treat/utilities.rb +127 -0
- data/lib/treat/visitable.rb +27 -0
- data/test/profile.rb +2 -0
- data/test/tc_detectors.rb +27 -0
- data/test/tc_entity.rb +105 -0
- data/test/tc_extractors.rb +48 -0
- data/test/tc_formatters.rb +46 -0
- data/test/tc_inflectors.rb +39 -0
- data/test/tc_lexicalizers.rb +39 -0
- data/test/tc_processors.rb +36 -0
- data/test/tc_resources.rb +27 -0
- data/test/tc_treat.rb +64 -0
- data/test/tc_tree.rb +60 -0
- data/test/tests.rb +19 -0
- data/test/texts.rb +20 -0
- data/test/texts/english/long.html +24 -0
- data/test/texts/english/long.txt +22 -0
- data/test/texts/english/medium.txt +5 -0
- data/test/texts/english/short.txt +3 -0
- metadata +412 -0
@@ -0,0 +1,86 @@
|
|
1
|
+
module Treat
  module Lexicalizers
    module Tag
      # Wrapper for the Stanford maximum-entropy POS tagger,
      # driven through the Ruby-Java bridge (Rjb).
      class Stanford
        # Require the Ruby-Java bridge and load the tagger JAR,
        # silencing the console output produced while loading.
        silently do
          require 'rjb'
          jar = "#{Treat.bin}/stanford_tagger/stanford-postagger.jar"
          unless File.readable?(jar)
            raise "Could not find stanford tagger JAR file in #{jar}." +
                  " You may need to set Treat.bin to a custom value."
          end
          # Reuse the path computed above instead of rebuilding the string.
          Rjb::load(jar, ['-Xms256M', '-Xmx512M'])
          MaxentTagger = ::Rjb::import('edu.stanford.nlp.tagger.maxent.MaxentTagger')
          Word = ::Rjb::import('edu.stanford.nlp.ling.Word')
          List = ::Rjb::import('java.util.ArrayList')
        end
        # A list of models to use by language.
        # Other models are available; see the models/ folder
        # in the Stanford Tagger distribution files.
        LanguageToModel = {
          eng: 'english-left3words-distsim.tagger',
          ger: 'german-fast.tagger',
          fra: 'french.tagger',
          ara: 'arabic-fast.tagger',
          chi: 'chinese.tagger'
        }
        # Hold one tagger per language.
        @@taggers = {}
        # Hold the user-set options for each language.
        @@options = {}
        # Hold the default options.
        DefaultOptions = {}
        # Tag the word(s) in +entity+ using one of the Stanford
        # taggers, setting the :tag feature on each tagged word.
        #
        # Options:
        #   :model => (String) File name of a tagger model to use
        #             instead of the per-language default.
        #
        # Returns the tag of the last word processed, or nil when
        # the tagger produced no output.
        def self.tag(entity, options = {})
          lang = entity.language
          # Find the model: an explicit :model option wins over the
          # per-language default.
          if options[:model]
            model = options[:model]
          else
            model = LanguageToModel[lang]
            if model.nil?
              # Bug fix: the original read `raise Treat::Exception "..."`
              # (missing comma), which Ruby parses as a method call and
              # raises NoMethodError instead of the intended exception.
              # Also restored the missing space between the two string parts.
              raise Treat::Exception, "There exists no Stanford " +
                    "tagger model for language #{lang}."
            end
          end
          # Reinitialize the tagger if the options have changed.
          if options != @@options[lang]
            @@options[lang] = DefaultOptions.merge(options)
            @@taggers[lang] = nil # Reset the tagger.
          end
          if @@taggers[lang].nil?
            model = "#{Treat.bin}/stanford_tagger/models/#{model}"
            unless File.readable?(model)
              raise "Could not find a tagger model for language #{lang}: looking in #{model}."
            end
            # The tagger logs to the console while loading; silence it.
            silence_streams(STDOUT, STDERR) do
              @@taggers[lang] = MaxentTagger.new(model)
            end
          end
          list = List.new
          id_list = {}
          i = 0
          [entity].each do |word| # Fix: should iterate over the entity's words.
            list.add(Word.new(word.to_s))
            id_list[i] = word
            i += 1
          end
          it = @@taggers[lang].apply(list).iterator
          i = 0
          last_tag = nil
          while it.has_next
            w = it.next
            id_list[i].set :tag, w.tag
            last_tag = w.tag
            i += 1
          end
          # Bug fix: the original returned `w.tag` after the loop, which
          # raised NoMethodError on nil when the tagger yielded no output.
          last_tag
        end
      end
    end
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module Treat
  # Category for processor groups.
  #
  # A processor group is a group of algorithms for the building
  # of trees representing textual entities.
  #
  # The processor groups include:
  #
  # - Chunkers : split a text into zone objects.
  # - Segmenters : split a text or zone into sentence objects.
  # - Tokenizers : split a sentence into Token objects.
  # - Parsers: split a sentence into a tree of constituents
  #   containing other constituents and Token objects, representing
  #   the syntactic structure.
  #
  # NOTE(review): `type` and `targets` are attributes declared by
  # Treat::Group (defined elsewhere in this gem); presumably they
  # register which entity types each algorithm group may be called
  # on — verify against group.rb.
  module Processors
    # Chunkers split a text into zones.
    module Chunkers
      extend Group
      self.type = :transformer
      self.targets = [:document, :text]
    end
    # Segmenters split a text or zone into sentences.
    module Segmenters
      extend Group
      self.type = :transformer
      self.targets = [:document, :text, :zone]
    end
    # Tokenizers split a sentence into Token objects.
    module Tokenizers
      extend Group
      self.type = :transformer
      self.targets = [:document, :text, :zone, :sentence, :constituent]
    end
    # Parsers split a sentence into constituent objects
    # representing its syntactic structure, with the
    # Token objects as children of the constituents.
    module Parsers
      extend Group
      self.type = :transformer
      self.targets = [:document, :text, :zone, :sentence, :constituent]
    end
    # Makes all the groups autoloadable and creates the delegators.
    # This must come after the group definitions above.
    extend Treat::Category
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Treat
  module Processors
    module Chunkers
      # This class separates a plain text file into
      # zones based on a very naive analysis of the
      # file: each non-blank line becomes one zone.
      class Txt
        # Split +text+ into zones, one per non-blank line, and append
        # each zone to +text+ as a Title or Paragraph entity.
        #
        # Heuristic: lines shorter than 60 characters are assumed to
        # be titles; longer lines are paragraphs.
        #
        # Returns +text+ with the new children attached.
        def self.chunk(text, options = {})
          zones = text.to_s.split("\n")
          zones.each do |zone|
            # Skip blank lines entirely.
            next if zone.strip == ''
            # Bug fix: the original contained an unreachable
            # `if false # fix` branch wrapping
            # `text << Entities::List.new(zone)`; the dead code
            # has been removed without changing behavior.
            if zone.length < 60
              text << Entities::Title.new(zone)
            else
              text << Entities::Paragraph.new(zone)
            end
          end
          text
        end
      end
    end
  end
end
|
@@ -0,0 +1,214 @@
|
|
1
|
+
module Treat
  module Processors
    module Parsers
      # The Enju class is a wrapper for the Enju syntactic
      # parser for English. Given a file or string input,
      # the parser formats it, runs it through Enju, and
      # parses the XML output by Enju using the Nokogiri
      # XML reader. It creates wrappers for the sentences,
      # syntactical constituents and tokens that Enju identified.
      #
      # Original paper:
      # Takuya Matsuzaki, Yusuke Miyao, and Jun'ichi Tsujii.
      # 2007. Efficient HPSG Parsing with Supertagging and
      # CFG-filtering. In Proceedings of IJCAI 2007.
      class Enju
        # Require the 'open3' library for interaction
        # with the background Enju process.
        require 'open3'
        # Pool of running Enju processes and a round-robin cursor.
        @@parsers = []
        @@i = 0
        # Require the Nokogiri XML parser.
        require 'nokogiri'
        # Maps Enju categories to Treat categories.
        CategoryMap = {
          'ADJ' => :adjective,
          'ADV' => :adverb,
          'CONJ' => :conjunction,
          'COOD' => :conjunction,
          'C' => :complementizer,
          'D' => :determiner,
          'N' => :noun,
          'P' => :preposition,
          'PN' => :punctuation,
          'SC' => :conjunction,
          'V' => :verb,
          'PRT' => :particle
        }
        # Return the pipes of a running Enju process, spawning new
        # processes lazily up to @@options[:processes], and cycling
        # through the pool round-robin.
        #
        # NOTE(review): the cursor logic (`@@i += 1`, reset, then
        # `@@parsers[@@i-1]`) looks off-by-one relative to a plain
        # round-robin; with one process it always yields index -1
        # or 0, which happens to work — verify for pools > 1.
        def self.proc
          if @@parsers.size < @@options[:processes]
            @@parsers << ::Open3.popen3("enju -xml -i")
          end
          @@i += 1
          @@i = 0 if @@i == @@parsers.size
          @@parsers[@@i-1]
        end
        # Parse the entity into its syntactical constituents
        # using Enju. Replaces the entity's children with the
        # parsed tree and returns the entity.
        #
        # Options:
        #   :processes => (Integer) Number of Enju processes to pool.
        def self.parse(entity, options = {})
          options[:processes] ||= 1
          @@options = options
          stdin, stdout = proc
          # Enju needs a sentence terminator; add a period if the
          # input has none and remember to strip it back out later.
          if entity.to_s.count('.') == 0
            remove_last = true
            text = entity.to_s + '.'
          else
            remove_last = false
            text = entity.to_s
          end
          stdin.puts(text + "\n")
          parsed = build(stdout.gets, remove_last)
          if not parsed.nil?
            entity.remove_all!
            parsed.children.each do |child|
              entity << child
            end
          else
            warn "Couldn't parse the text '#{entity.to_s}'."
          end
          entity
        end
        # Parses an Enju XML output file using the Nokogiri
        # XML reader and converts that structure into a tree
        # of wrappers for textual entities.
        #
        # NOTE(review): several locals (new_attributes, id, edges,
        # tmp_attributes, tmp_edges) are assigned inside one loop
        # iteration and read in later iterations (e.g. a 'tok'
        # element's attributes are attached when its text node is
        # visited). This cross-iteration reuse appears intentional
        # but makes the first iterations fragile if the XML is not
        # shaped as expected — confirm against real Enju output.
        def self.build(xml, remove_last = false)
          # Read in the XML file.
          xml_reader = Nokogiri::XML::Reader.from_memory(xml)
          current_element = nil
          previous_depth = 0
          # Map Enju element ids -> Treat entity ids, and entity
          # ids -> their outgoing predicate edges.
          id_table = {}
          edges_table = {}
          # Read the XML file node by node.
          while xml_reader.read
            # The depth in the XML tree.
            current_depth = xml_reader.depth
            # If we are at the end of the children stack, pop up.
            if previous_depth > current_depth
              current_element = current_element.parent
            end
            # If an end element has been reached,
            # change the depth and pop up on next
            # iteration.
            if xml_reader.node_type ==
              Nokogiri::XML::Reader::TYPE_END_ELEMENT
              previous_depth = current_depth
              next
            end
            attributes = xml_reader.attributes
            # Attributes in this list get an 'enju_' prefix to avoid
            # clashing with Treat feature names.
            prefix = ['schema', 'lexentry', 'type']
            # If the element has attributes, convert them.
            unless attributes.empty?
              new_attributes = {}
              edges = {}
              id = attributes.delete('id')
              pred = attributes.delete('pred')
              attributes.each_pair do |attribute, value|
                if ['arg1', 'arg2'].include?(attribute)
                  # Predicate arguments become edges to other ids.
                  edges[value] = pred
                else
                  if attribute == 'cat'
                    if xml_reader.name == 'tok'
                      # Trailing P/X on a token category encodes
                      # saturation; strip it off (except for 'PN').
                      if value.length > 1 && ['P', 'X'].include?(value[-1]) &&
                        value != 'PN'
                        new_attributes[:saturated] = (value[-1] == 'P')
                        value = value[0..-2]
                      end
                      cat = CategoryMap[value]
                      new_attributes[:cat] = cat
                    else
                      # Constituent: keep the raw Enju categories and
                      # map (cat, xcat) to a Penn Treebank tag.
                      new_attributes[:enju_cat] = value
                      xcat = attributes['xcat'].split(' ')[0]
                      xcat ||= ''
                      tags = Treat::Resources::Tags::EnjuCatXcatToPTB.select do |m|
                        m[0] == value && m[1] == xcat
                      end
                      if tags.empty?
                        tag = 'UK'
                      else
                        tag = tags[0][2]
                      end
                      new_attributes[:enju_xcat] = xcat
                      attributes.delete('xcat')
                      new_attributes[:tag] = tag
                    end
                  else
                    pre = prefix.include?(attribute) ? 'enju_' : ''
                    new_attributes[:"#{pre+attribute}"] = value
                  end
                end
              end
              attributes.delete('arg1')
              attributes.delete('arg2')
            end
            # Handle naming conventions.
            # NOTE(review): this checks the (possibly emptied) raw
            # attribute hash for 'pos' but reads :pos from
            # new_attributes — verify the key ever matches.
            if attributes.has_key?('pos')
              new_attributes[:tag] = new_attributes[:pos]
              new_attributes.delete :pos
            end
            # Create the appropriate entity for the
            # element.
            current_value = ''
            attributes = new_attributes
            case xml_reader.name
            when 'sentence'
              current_element = Treat::Entities::Sentence.new('')
              id_table[id] = current_element.id
              edges_table[current_element.id] = edges
              current_element.features = attributes
            when 'cons'
              current_element = current_element <<
              Treat::Entities::Phrase.new('')
              id_table[id] = current_element.id
              edges_table[current_element.id] = edges
              current_element.features = attributes
            when 'tok'
              # Defer: the token's attributes are attached when its
              # text node is visited (next iteration).
              tmp_attributes = attributes
              tmp_edges = edges
            else
              current_value = xml_reader.value.gsub(/\s+/, "")
              if !current_value.empty?
                current_element = current_element <<
                Treat::Entities::Entity.from_string(current_value)
                if current_element.is_a?(Treat::Entities::Word)
                  current_element.features = tmp_attributes
                  id_table[id] = current_element.id
                  edges_table[current_element.id] = tmp_edges
                end
              end
            end
            previous_depth = current_depth
          end
          # Add the edges to the entities.
          unless current_element.nil?
            root = current_element.root
            edges_table.each_pair do |id2, edges2|
              # Next if there are no edges.
              next if edges2.nil?
              entity = root.find(id2)
              edges2.each_pair do |argument, type|
                # Skip this argument if we don't know
                # the target node.
                next if argument == 'unk'
                entity.associate(id_table[argument], type)
              end
            end
            # Link the head and sem_head to their entities.
            root.each_constituent do |constituent|
              constituent.set :head,
              root.find(id_table[constituent.head])
              constituent.set :sem_head,
              root.find(id_table[constituent.sem_head])
            end
          end
          # Remove the period we added at the end.
          # NOTE(review): runs outside the nil guard above; if
          # parsing produced nothing this would raise on nil.
          if remove_last
            last = current_element.punctuations[-1]
            current_element.remove!(last)
          end
          current_element
        end
      end
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
module Treat
  module Processors
    module Parsers
      # Wrapper for the Stanford lexicalized PCFG parser,
      # driven through the Ruby-Java bridge (Rjb).
      class Stanford
        # Require the Ruby-Java bridge.
        silently { require 'rjb' }
        jar = "#{Treat.bin}/stanford_parser/stanford-parser.jar"
        unless File.readable?(jar)
          raise "Could not find stanford parser JAR file in #{jar}."+
          " You may need to set Treat.bin to a custom value."
        end
        Rjb::load(jar, ['-Xms256M', '-Xmx512M'])
        LexicalizedParser = ::Rjb::import('edu.stanford.nlp.parser.lexparser.LexicalizedParser')
        # Hold one parser per language.
        @@parsers = {}
        # Parse +entity+ with the Stanford parser for its language,
        # replacing the entity's children with the parse tree.
        # Returns the entity.
        def self.parse(entity, options = {})
          lang = Treat::Resources::Languages.describe(entity.language).to_s
          # Grammar files are named e.g. englishPCFG.ser.gz; presumably
          # `lang.upcase` should only capitalize the first letter —
          # TODO confirm against the grammar/ folder naming.
          pcfg = "#{Treat.bin}/stanford_parser/grammar/#{lang.upcase}PCFG.ser.gz"
          unless File.readable?(pcfg)
            raise "Could not find a language model for #{lang}: looking in #{pcfg}."
          end
          @@parsers[lang] ||= LexicalizedParser.new(pcfg) # Fix - check that exists.
          parse = @@parsers[lang].apply(entity.to_s)
          entity.remove_all!
          recurse(parse, entity)
          entity
        end
        # Recursively convert the Java parse tree rooted at
        # +java_node+ into Treat entities under +ruby_node+.
        # Single-child interior nodes are collapsed.
        def self.recurse(java_node, ruby_node)
          # Leaf
          if java_node.num_children == 0
            ruby_child = Treat::Entities::Entity.from_string(java_node.value)
            labels = java_node.labels.iterator
            while labels.has_next
              label = labels.next
              ruby_child.set :begin_char, label.begin_position
              ruby_child.set :end_char, label.end_position
              # NOTE(review): the leaf's tag is copied from the parent
              # ruby_node, i.e. the phrase label — confirm this is the
              # intended token tag.
              ruby_child.set :tag, ruby_node.tag
            end
            ruby_node << ruby_child
          else
            # Collapse unary chains into their parent.
            if java_node.num_children == 1
              return recurse(java_node.children[0], ruby_node)
            end
            java_node.children.each do |java_child|
              # NOTE(review): computed but unused except for the
              # commented-out loop below; kept as-is in case the Java
              # call has side effects.
              dependencies = java_child.dependencies.iterator
              # while dependencies.has_next
              #dependency = dependencies.next
              # end
              ruby_child = Treat::Entities::Phrase.new
              ruby_child.set :tag, java_child.value
              ruby_node << ruby_child
              unless java_child.children.empty?
                recurse(java_child, ruby_child)
              end
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
module Treat
  module Processors
    module Segmenters
      # An adapter for the 'punkt-segmenter' gem, which segments
      # texts into sentences based on an unsupervised, language
      # independent algorithm.
      #
      # Original paper: Kiss, Tibor and Strunk, Jan (2006):
      # Unsupervised Multilingual Sentence Boundary Detection.
      # Computational Linguistics 32: 485-525.
      class Punkt
        silently { require 'punkt-segmenter' }
        # One segmenter per language, built lazily.
        @@segmenters = {}
        # One trainer per language, built lazily.
        @@trainers = {}
        # Default corpora used to train the segmenter when the
        # caller supplies no :training_text.
        @@training_texts = {
          eng: "A minute is a unit of measurement of time or of angle. The minute is a unit of time equal to 1/60th of an hour or 60 seconds by 1. In the UTC time scale, a minute occasionally has 59 or 61 seconds; see leap second. The minute is not an SI unit; however, it is accepted for use with SI units. The symbol for minute or minutes is min. The fact that an hour contains 60 minutes is probably due to influences from the Babylonians, who used a base-60 or sexagesimal counting system. Colloquially, a min. may also refer to an indefinite amount of time substantially longer than the standardized length."
        }
        # Segment +entity+ into sentences using the Punkt segmenter,
        # appending each sentence to the entity as a child.
        #
        # Options:
        # :training_text => (String) Text to train the segmenter on.
        #
        # Raises when no training text is available for the
        # entity's language. Returns the entity.
        def self.segment(entity, options = {})
          language = entity.language
          corpus = options[:training_text] || @@training_texts[language]
          raise "No training text available for language #{language}." unless corpus
          # Train once per language and cache the resulting segmenter.
          if @@trainers[language].nil?
            trainer = ::Punkt::Trainer.new
            trainer.train(corpus)
            @@trainers[language] = trainer
            @@segmenters[language] =
              ::Punkt::SentenceTokenizer.new(trainer.parameters)
          end
          sentences = @@segmenters[language].sentences_from_text(
            entity.to_s, :output => :sentences_text
          )
          sentences.each { |sentence| entity << Entities::Entity.from_string(sentence) }
          entity
        end
      end
    end
  end
end
|