proiel 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +19 -0
- data/README.md +99 -0
- data/bin/console +6 -0
- data/bin/setup +5 -0
- data/lib/proiel/annotation_schema.rb +127 -0
- data/lib/proiel/citations.rb +84 -0
- data/lib/proiel/div.rb +133 -0
- data/lib/proiel/positional_tag.rb +127 -0
- data/lib/proiel/proiel_xml/proiel-1.0/proiel-1.0.xsd +172 -0
- data/lib/proiel/proiel_xml/proiel-1.0/teilite.xsd +7387 -0
- data/lib/proiel/proiel_xml/proiel-1.0/xml.xsd +287 -0
- data/lib/proiel/proiel_xml/proiel-2.0/proiel-2.0.xsd +185 -0
- data/lib/proiel/proiel_xml/reader.rb +237 -0
- data/lib/proiel/proiel_xml/schema.rb +81 -0
- data/lib/proiel/proiel_xml/validator.rb +177 -0
- data/lib/proiel/sentence.rb +191 -0
- data/lib/proiel/source.rb +114 -0
- data/lib/proiel/statistics.rb +41 -0
- data/lib/proiel/token.rb +407 -0
- data/lib/proiel/tokenization.rb +90 -0
- data/lib/proiel/treebank.rb +214 -0
- data/lib/proiel/treebank_object.rb +21 -0
- data/lib/proiel/version.rb +9 -0
- data/lib/proiel.rb +28 -0
- metadata +210 -0
@@ -0,0 +1,237 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2015 Marius L. Jøhndal
|
3
|
+
#
|
4
|
+
# See LICENSE in the top-level source directory for licensing terms.
|
5
|
+
#++
|
6
|
+
module PROIEL
|
7
|
+
module PROIELXML
|
8
|
+
# @api private
|
9
|
+
module Reader
|
10
|
+
# Parsing class for `slash` elements.
|
11
|
+
class Slash
  include SAXMachine

  # XML attribute `target-id` is exposed as the integer accessor `target_id`.
  attribute :'target-id', as: :target_id, class: Integer, required: true

  # Relation label carried by the slash.
  attribute :relation, required: true
end
|
17
|
+
|
18
|
+
# Parsing class for `token` elements.
|
19
|
+
class Token
  include SAXMachine

  # Integer-valued identifiers.
  attribute :id, class: Integer, required: true
  attribute :'head-id', as: :head_id, class: Integer

  # Annotation attributes; hyphenated XML names are exposed under
  # underscore accessors via `as:`.
  attribute :form
  attribute :lemma
  attribute :'part-of-speech', as: :part_of_speech
  attribute :morphology
  attribute :relation
  attribute :'empty-token-sort', as: :empty_token_sort
  attribute :'citation-part', as: :citation_part
  attribute :'presentation-before', as: :presentation_before
  attribute :'presentation-after', as: :presentation_after
  attribute :'antecedent-id', as: :antecedent_id, class: Integer
  attribute :'information-status', as: :information_status
  attribute :'contrast-group', as: :contrast_group
  attribute :'foreign-ids', as: :foreign_ids

  # Every nested `slash` element is collected into `slashes`.
  elements :slash, as: :slashes, class: Slash
end
|
40
|
+
|
41
|
+
# Parsing class for `sentence` elements.
|
42
|
+
class Sentence
  include SAXMachine

  attribute :id, class: Integer, required: true

  # A sentence without an explicit status is treated as unannotated.
  attribute :status, class: Symbol, default: :unannotated

  attribute :'presentation-before', as: :presentation_before
  attribute :'presentation-after', as: :presentation_after

  # Every nested `token` element is collected into `tokens`.
  elements :token, as: :tokens, class: Token
end
|
52
|
+
|
53
|
+
# Parsing class for `div` elements.
|
54
|
+
class Div
  include SAXMachine

  attribute :id
  attribute :'presentation-before', as: :presentation_before
  attribute :'presentation-after', as: :presentation_after

  # Single `title` child element.
  element :title

  # Every nested `sentence` element is collected into `sentences`.
  elements :sentence, as: :sentences, class: Sentence
end
|
64
|
+
|
65
|
+
# Parsing class for `source` elements.
|
66
|
+
class Source
  include SAXMachine

  attribute :id, required: true
  attribute :language, required: true

  # Plain metadata child elements. Each name is declared with `element`,
  # one after the other in the order listed, with no extra options.
  %i[
    title
    author
    citation_part
    principal
    funder
    distributor
    distributor_address
    date
    license
    license_url
    reference_system
    editor
    editorial_note
    annotator
    reviewer
    electronic_text_editor
    electronic_text_title
    electronic_text_version
    electronic_text_publisher
    electronic_text_place
    electronic_text_date
    electronic_text_original_url
    electronic_text_license
    electronic_text_license_url
    printed_text_editor
    printed_text_title
    printed_text_edition
    printed_text_publisher
    printed_text_place
    printed_text_date
  ].each { |field| element field }

  # Every nested `div` element is collected into `divs`.
  elements :div, as: :divs, class: Div
end
|
104
|
+
|
105
|
+
# Parsing class for `relations/value` elements.
|
106
|
+
class RelationValue
  include SAXMachine

  # All four attributes are mandatory; they are declared in this order.
  %i[tag summary primary secondary].each do |name|
    attribute name, required: true
  end
end
|
114
|
+
|
115
|
+
# Parsing class for `relations` elements.
|
116
|
+
class Relations
  include SAXMachine

  # Every nested `value` element is collected into `values`.
  elements :value, as: :values, class: RelationValue
end
|
121
|
+
|
122
|
+
# Parsing class for `parts_of_speech/value` elements.
|
123
|
+
class PartOfSpeechValue
  include SAXMachine

  # Both attributes are mandatory.
  %i[tag summary].each do |name|
    attribute name, required: true
  end
end
|
129
|
+
|
130
|
+
# Parsing class for `parts_of_speech` elements.
|
131
|
+
class PartsOfSpeech
  include SAXMachine

  # Every nested `value` element is collected into `values`.
  elements :value, as: :values, class: PartOfSpeechValue
end
|
136
|
+
|
137
|
+
# Parsing class for `morphology/field/value` elements.
|
138
|
+
class MorphologyValue
  include SAXMachine

  # Both attributes are mandatory.
  %i[tag summary].each do |name|
    attribute name, required: true
  end
end
|
144
|
+
|
145
|
+
# Parsing class for `morphology/field` elements.
|
146
|
+
class MorphologyField
  include SAXMachine

  # Mandatory tag identifying the field.
  attribute :tag, required: true

  # Every nested `value` element is collected into `values`.
  elements :value, as: :values, class: MorphologyValue
end
|
153
|
+
|
154
|
+
# Parsing class for `morphology` elements.
|
155
|
+
class Morphology
  include SAXMachine

  # Every nested `field` element is collected into `fields`.
  elements :field, as: :fields, class: MorphologyField
end
|
160
|
+
|
161
|
+
# Parsing class for `information_statuses/value` elements.
|
162
|
+
class InformationStatusValue
  include SAXMachine

  # Both attributes are mandatory.
  %i[tag summary].each do |name|
    attribute name, required: true
  end
end
|
168
|
+
|
169
|
+
# Parsing class for `information_statuses` elements.
|
170
|
+
class InformationStatuses
  include SAXMachine

  # Every nested `value` element is collected into `values`.
  elements :value, as: :values, class: InformationStatusValue
end
|
175
|
+
|
176
|
+
# Parsing class for `annotation` elements.
|
177
|
+
class Annotation
  include SAXMachine

  # One child element per tag inventory, each parsed by its own class.
  element :relations, class: Relations
  element :parts_of_speech, as: :parts_of_speech, class: PartsOfSpeech
  element :morphology, class: Morphology
  element :information_statuses, as: :information_statuses, class: InformationStatuses
end
|
185
|
+
|
186
|
+
# Parsing class for `proiel` elements.
|
187
|
+
class Proiel
  include SAXMachine

  # Hyphenated XML attribute names are exposed under underscore accessors.
  attribute :'export-time', as: :export_time
  attribute :'schema-version', as: :schema_version, required: true

  # Every nested `source` element is collected into `sources`.
  elements :source, as: :sources, class: Source

  # Single `annotation` child element.
  element :annotation, class: Annotation
end
|
196
|
+
|
197
|
+
# Top-level parsing class for a PROIEL XML file.
|
198
|
+
class TreebankFile
  include SAXMachine

  # The document's `proiel` element, parsed by the Proiel class.
  element :proiel, class: Proiel
end
|
203
|
+
|
204
|
+
# Parses PROIEL XML data.
|
205
|
+
#
|
206
|
+
# This does not automatically validate the PROIEL XML. If given an
|
207
|
+
# invalid PROIEL XML file, parsing is likely to succeed but the returned
|
208
|
+
# objects will be in an inconsistent state.
|
209
|
+
#
|
210
|
+
# @see parse_io
|
211
|
+
#
|
212
|
+
# @param xml [String] PROIEL XML to parse
|
213
|
+
#
|
214
|
+
# @return [TreebankFile]
|
215
|
+
#
|
216
|
+
def self.parse_xml(xml)
  # Delegates to the class-level `parse` of TreebankFile and returns its
  # result unchanged; no validation is performed here.
  TreebankFile.parse(xml)
end
|
219
|
+
|
220
|
+
# Parses a PROIEL XML file.
|
221
|
+
#
|
222
|
+
# This does not automatically validate the PROIEL XML. If given an
|
223
|
+
# invalid PROIEL XML file, parsing is likely to succeed but the returned
|
224
|
+
# objects will be in an inconsistent state.
|
225
|
+
#
|
226
|
+
# @see parse_xml
|
227
|
+
#
|
228
|
+
# @param io [IO] stream representing the PROIEL XML file
|
229
|
+
#
|
230
|
+
# @return [TreebankFile]
|
231
|
+
#
|
232
|
+
def self.parse_io(io)
  # The whole stream is read into memory before it is handed to parse_xml.
  contents = io.read
  parse_xml(contents)
end
|
235
|
+
end
|
236
|
+
end
|
237
|
+
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2015 Marius L. Jøhndal
|
3
|
+
#
|
4
|
+
# See LICENSE in the top-level source directory for licensing terms.
|
5
|
+
#++
|
6
|
+
module PROIEL
|
7
|
+
module PROIELXML
|
8
|
+
# Functionality concerned with PROIEL XML schema loading and versioning.
|
9
|
+
# Functionality for validation using a PROIEL XML schema is found in
|
10
|
+
# {PROIEL::PROIELXML::Validator}.
|
11
|
+
#
|
12
|
+
# @api private
|
13
|
+
module Schema
|
14
|
+
# Returns the current version of the PROIEL XML schema.
|
15
|
+
#
|
16
|
+
# @return [String] schema version number
|
17
|
+
#
|
18
|
+
def self.current_proiel_xml_schema_version
  # Hard-coded; a fresh String is returned on every call.
  '2.0'
end
|
21
|
+
|
22
|
+
# Invalid PROIEL XML schema version error.
|
23
|
+
#
|
24
|
+
# This represents an error that occurs when an unknown PROIEL XML schema
|
25
|
+
# version number is encountered or one that could not be parsed.
|
26
|
+
class InvalidSchemaVersion < RuntimeError
  # No behaviour of its own; the class exists so callers can rescue
  # schema-version problems specifically.
end
|
27
|
+
|
28
|
+
# Opens a PROIEL XML file and peeks at the schema version number
|
29
|
+
# that the file claims it conforms to.
|
30
|
+
#
|
31
|
+
# @return [String] schema version number
|
32
|
+
#
|
33
|
+
# @raise InvalidSchemaVersion
|
34
|
+
#
|
35
|
+
def self.check_schema_version_of_xml_file(filename)
  doc = Nokogiri::XML(File.read(filename))
  root = doc && doc.root

  # Anything whose root element is not `proiel` is rejected outright.
  unless root && root.name == 'proiel'
    raise InvalidSchemaVersion, 'top-level XML element not found'
  end

  declared = root.attr('schema-version')

  # A missing schema-version attribute is reported as version 1.0.
  return '1.0' if declared.nil?
  return '2.0' if declared == '2.0'

  raise InvalidSchemaVersion, 'invalid schema version number'
end
|
51
|
+
|
52
|
+
# Loads a PROIEL XML schema.
|
53
|
+
#
|
54
|
+
# @return [Nokogiri::XML::Schema] the loaded schema
|
55
|
+
#
|
56
|
+
# @raise RuntimeError
|
57
|
+
#
|
58
|
+
def self.load_proiel_xml_schema(schema_version)
  # Raises ArgumentError for a version without a bundled schema file.
  filename = proiel_xml_schema_filename(schema_version)

  # File.read opens, reads and closes the file in a single call, so no file
  # handle is left open after the schema text has been loaded.
  Nokogiri::XML::Schema(File.read(filename))
end
|
63
|
+
|
64
|
+
# Determines the filename of a specific version of the PROIEL XML schema.
|
65
|
+
#
|
66
|
+
# @return [String] filename
|
67
|
+
#
|
68
|
+
# @raise ArgumentError
|
69
|
+
#
|
70
|
+
def self.proiel_xml_schema_filename(schema_version)
  # Only these two versions are accepted.
  known = schema_version == '1.0' || schema_version == '2.0'
  raise ArgumentError, 'invalid schema version' unless known

  # Layout: <this directory>/proiel-<version>/proiel-<version>.xsd
  basename = "proiel-#{schema_version}"
  File.join(File.dirname(__FILE__), basename, "#{basename}.xsd")
end
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
@@ -0,0 +1,177 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2015 Marius L. Jøhndal
|
3
|
+
#
|
4
|
+
# See LICENSE in the top-level source directory for licensing terms.
|
5
|
+
#++
|
6
|
+
module PROIEL
|
7
|
+
module PROIELXML
|
8
|
+
# A validator object that uses an XML schema as well as additional
|
9
|
+
# integrity checks to validate a PROIEL XML file. Functionality for
|
10
|
+
# loading the XML schema and checking the PROIEL XML version number is
|
11
|
+
# found in {PROIEL::PROIELXML::Schema}.
|
12
|
+
class Validator
  # Returns an array of error messages generated during validation. The
  # same array object is kept for the lifetime of the validator; every
  # check appends to it in place.
  attr_reader :errors

  # Creates a new validator for a PROIEL XML file.
  #
  # @param filename [String] name of PROIEL XML file to validate
  #
  def initialize(filename)
    @filename = filename
    @errors = []
  end

  # Checks if the PROIEL XML file is valid. This checks for
  # well-formedness, a valid schema version, validation against the schema
  # and referential integrity, in that order, stopping at the first check
  # that fails.
  #
  # If invalid, `errors` will contain error messages.
  #
  # @return [true, false]
  #
  def valid?
    wellformed? && valid_schema_version? && validates? && has_referential_integrity?
  end

  # Checks if the PROIEL XML file is well-formed XML.
  #
  # If not well-formed, an error message will be appended to `errors`.
  #
  # @return [true, false]
  #
  def wellformed?
    # Strict parsing raises on the first syntax error instead of recovering.
    Nokogiri::XML(File.read(@filename)) { |config| config.strict }

    true
  rescue Nokogiri::XML::SyntaxError
    @errors << 'XML file is not wellformed'

    false
  end

  # Checks if the PROIEL XML file has a valid schema version number.
  #
  # If invalid, an error message will be appended to `errors`.
  #
  # @return [true, false]
  #
  def valid_schema_version?
    schema_version = PROIEL::PROIELXML::Schema.check_schema_version_of_xml_file(@filename)

    return true unless schema_version.nil?

    @errors << 'invalid schema version'

    false
  rescue PROIEL::PROIELXML::Schema::InvalidSchemaVersion => e
    @errors << e.message

    false
  end

  # Checks if the PROIEL XML file validates against the schema.
  #
  # If invalid, error messages will be appended to `errors`.
  #
  # @return [true, false]
  #
  def validates?
    doc = Nokogiri::XML(File.read(@filename))

    schema_version = PROIEL::PROIELXML::Schema.check_schema_version_of_xml_file(@filename)
    schema = PROIEL::PROIELXML::Schema.load_proiel_xml_schema(schema_version)

    problems = schema.validate(doc)
    return true if problems.empty?

    # Append in place so references obtained from `errors` stay current.
    @errors.concat(problems.map { |e| "Line #{e.line}: #{e.message}" })

    false
  end

  # Checks the referential integrity of the PROIEL XML file.
  #
  # If inconsistencies are found, error messages will be appended to `errors`.
  #
  # @return [true, false]
  #
  def has_referential_integrity?
    tb = PROIEL::Treebank.new
    tb.load_from_xml(@filename)

    found = []

    # Pass 1: keep track of all object IDs and look for duplicates
    sentence_ids = {}
    token_ids = {}

    tb.sources.each do |source|
      source.divs.each do |div|
        div.sentences.each do |sentence|
          found << "Repeated sentence ID #{sentence.id}" if sentence_ids.key?(sentence.id)
          sentence_ids[sentence.id] = true

          sentence.tokens.each do |token|
            found << "Repeated token ID #{token.id}" if token_ids.key?(token.id)
            # Remember where each token lives so pass 2 can check locality.
            token_ids[token.id] = { sentence: sentence.id, div: div.id, source: source.id }
          end
        end
      end
    end

    # Pass 2: check object ID references
    tb.sources.each do |source|
      source.tokens.each do |token|
        # Head IDs and slash IDs should be sentence internal
        check_reference_locality(found, token, token_ids, :head_id, token.head_id,
                                 domain: :sentence, allow_nil: true)

        # Each slash is destructured as a pair; only its second member
        # (the target ID) is checked.
        token.slashes.each do |_, target_id|
          check_reference_locality(found, token, token_ids, :slash_id, target_id,
                                   domain: :sentence, allow_nil: false)
        end

        # Antecedent IDs should be source internal
        check_reference_locality(found, token, token_ids, :antecedent_id, token.antecedent_id,
                                 domain: :source, allow_nil: true)
      end
    end

    # Pass 3: verify that all features are defined
    # TBD

    return true if found.empty?

    # Append in place so references obtained from `errors` stay current.
    @errors.concat(found)

    false
  end

  private

  # Verifies a single ID reference held by `token`.
  #
  # @param errors [Array<String>] list that error messages are appended to
  # @param token [Object] referring token; must respond to `id` and to the
  #   method named by `domain`, whose result must respond to `id`
  # @param token_ids [Hash] token ID => `{ sentence:, div:, source: }`
  # @param attribute_name [Symbol] attribute name used in error messages
  # @param attribute_value [Integer, nil] the referenced token ID
  # @param domain [Symbol] `:sentence` or `:source`; the referenced token
  #   must live in the same object of this kind as `token`
  # @param allow_nil [true, false] whether a missing reference is acceptable
  #
  def check_reference_locality(errors, token, token_ids, attribute_name,
                               attribute_value, domain: :sentence, allow_nil: false)
    if attribute_value
      referenced_token = token_ids[attribute_value]

      if referenced_token.nil?
        errors << "Token #{token.id}: #{attribute_name} references an unknown token"
      elsif referenced_token[domain] != token.send(domain).id
        errors << "Token #{token.id}: #{attribute_name} references a token in a different #{domain}"
      end
    elsif !allow_nil
      errors << "Token #{token.id}: #{attribute_name} is null"
    end
  end
end
|
176
|
+
end
|
177
|
+
end
|
@@ -0,0 +1,191 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2015 Marius L. Jøhndal
|
3
|
+
#
|
4
|
+
# See LICENSE in the top-level source directory for licensing terms.
|
5
|
+
#++
|
6
|
+
module PROIEL
|
7
|
+
# A sentence object in a treebank.
|
8
|
+
class Sentence < TreebankObject
|
9
|
+
extend Memoist
|
10
|
+
|
11
|
+
# @return [Integer] ID of the sentence
|
12
|
+
attr_reader :id
|
13
|
+
|
14
|
+
# @return [Div] parent div object
|
15
|
+
attr_reader :div
|
16
|
+
|
17
|
+
# @return [Symbol] annotation status of sentence
|
18
|
+
attr_reader :status
|
19
|
+
|
20
|
+
# @return [nil, String] presentation material before sentence
|
21
|
+
attr_reader :presentation_before
|
22
|
+
|
23
|
+
# @return [nil, String] presentation material after sentence
|
24
|
+
attr_reader :presentation_after
|
25
|
+
|
26
|
+
# Creates a new sentence object.
|
27
|
+
def initialize(parent, id, status, presentation_before, presentation_after, &block)
  @div = parent

  raise ArgumentError, 'integer expected' unless id.is_a?(Integer)
  @id = id

  # Strings are accepted for convenience; the status is stored as a symbol.
  raise ArgumentError, 'string or symbol expected' unless status.is_a?(String) || status.is_a?(Symbol)
  @status = status.to_sym

  # NOTE: the strings passed in are frozen themselves, not copies of them.
  raise ArgumentError, 'string or nil expected' unless presentation_before.nil? || presentation_before.is_a?(String)
  @presentation_before = presentation_before.freeze

  raise ArgumentError, 'string or nil expected' unless presentation_after.nil? || presentation_after.is_a?(String)
  @presentation_after = presentation_after.freeze

  # The block receives the new sentence and returns its child tokens. When
  # no block is given the sentence has no tokens; an empty list keeps every
  # method that walks the children usable instead of failing on nil.
  @children = block_given? ? block.call(self) : []
end
|
44
|
+
|
45
|
+
# @return [Source] parent source object
|
46
|
+
def source
  # Reached through the parent div.
  parent_div = @div
  parent_div.source
end
|
49
|
+
|
50
|
+
# @return [Treebank] parent treebank object
|
51
|
+
def treebank
  # Two hops up: parent div, then that div's source, then its treebank.
  owning_source = @div.source
  owning_source.treebank
end
|
54
|
+
|
55
|
+
# @return [String] language of the sentence as an ISO 639-3 language tag
|
56
|
+
def language
  # The sentence has no language of its own; it reports its source's.
  owning_source = source
  owning_source.language
end
|
59
|
+
|
60
|
+
memoize :language
|
61
|
+
|
62
|
+
# @return [String] the complete citation for the sentence
|
63
|
+
def citation
  # Source-level component first, then the sentence-level component,
  # separated by a single space.
  parts = [source.citation_part, citation_part]
  parts.join(' ')
end
|
66
|
+
|
67
|
+
# Computes an appropriate citation component for the sentence.
|
68
|
+
#
|
69
|
+
# The computed citation component must be concatenated with the citation
|
70
|
+
# component provided by the source to produce a complete citation.
|
71
|
+
#
|
72
|
+
# @see citation
|
73
|
+
#
|
74
|
+
# @return [String] the citation component
|
75
|
+
def citation_part
  # Only tokens that carry a citation take part; the range runs from the
  # first such token to the last. With no such token both ends are nil.
  cited = @children.select(&:has_citation?)
  range_start, range_end = [cited.first, cited.last].map do |token|
    token.citation_part if token
  end

  Citations.citation_make_range(range_start, range_end)
end
|
82
|
+
|
83
|
+
# Returns the printable form of the sentence with all token forms and any
|
84
|
+
# presentation data.
|
85
|
+
#
|
86
|
+
# @return [String] the printable form of the sentence
|
87
|
+
def printable_form(options = {})
  # The options hash is passed through unchanged to every token.
  token_forms = @children.map { |token| token.printable_form(options) }

  # compact drops missing presentation strings; join flattens the nested
  # list of token forms and concatenates everything without a separator.
  pieces = [presentation_before, token_forms, presentation_after]
  pieces.compact.join
end
|
92
|
+
|
93
|
+
# Checks if the sentence is reviewed.
|
94
|
+
#
|
95
|
+
# A sentence has been reviewed if its `status` is `:reviewed`.
|
96
|
+
#
|
97
|
+
# @return [true,false]
|
98
|
+
def reviewed?
  # True for exactly one status value.
  :reviewed == @status
end
|
101
|
+
|
102
|
+
# Checks if the sentence is annotated.
|
103
|
+
#
|
104
|
+
# Since only annotated sentences can be reviewed, a sentence is annotated
|
105
|
+
# if its `status` is either `:reviewed` or `:annotated`.
|
106
|
+
#
|
107
|
+
# @return [true,false]
|
108
|
+
def annotated?
  # Reviewed sentences count as annotated too.
  %i[reviewed annotated].include?(@status)
end
|
111
|
+
|
112
|
+
# Checks if the sentence is unannotated.
|
113
|
+
#
|
114
|
+
# A sentence is unannotated if its `status` is `:unannotated`.
|
115
|
+
#
|
116
|
+
# @return [true,false]
|
117
|
+
def unannotated?
  # True for exactly one status value.
  :unannotated == @status
end
|
120
|
+
|
121
|
+
# Builds a syntax graph for the dependency annotation of the sentence and
|
122
|
+
# inserts a dummy root node. The graph is represented as a hash of
|
123
|
+
# hashes. Each hash contains the ID of the token, its relation (to its
|
124
|
+
# syntactically dominating token) and a list of secondary edges.
|
125
|
+
#
|
126
|
+
# @return [Hash] a single graph with a dummy root node represented as a hash
|
127
|
+
#
|
128
|
+
# @example
|
129
|
+
#
|
130
|
+
# sentence.syntax_graph # => { id: nil, relation: nil, children: [{ id: 1000, relation: "pred", children: [{ id: 1001, relation: "xcomp", children: [], slashes: [["xsub", 1000]] }], slashes: [] }], slashes: [] }
|
131
|
+
#
|
132
|
+
def syntax_graph
  # The dummy root has no ID, relation or slashes; the graphs returned by
  # syntax_graphs become its children.
  roots = syntax_graphs
  { id: nil, relation: nil, children: roots, slashes: [] }
end
|
135
|
+
|
136
|
+
# Builds syntax graphs for the dependency annotation of the sentence.
|
137
|
+
# Multiple graphs may be returned as the function does not insert an
|
138
|
+
# empty dummy root node. Each graph is represented as a hash of hashes.
|
139
|
+
# Each hash contains the ID of the token, its relation (to its
|
140
|
+
# syntactically dominating token) and a list of secondary edges.
|
141
|
+
#
|
142
|
+
# @return [Array] zero or more syntax graphs represented as hashes
|
143
|
+
#
|
144
|
+
# @example Get the syntax graphs without a dummy root node
|
145
|
+
#
|
146
|
+
# sentence.syntax_graphs # => [{ id: 1000, relation: "pred", children: [ { id: 1001, relation: "xcomp", children: [], slashes: [["xsub", 1000]]}]}]
|
147
|
+
#
|
148
|
+
def syntax_graphs
  # Pass 1: one node hash per token, indexed by token ID.
  nodes = @children.each_with_object({}) do |token, index|
    index[token.id] = {
      id: token.id,
      relation: token.relation,
      children: [],
      slashes: token.slashes,
    }
  end

  # Pass 2: hang each node under its head's children list; nodes without a
  # head ID become the roots of the returned graphs.
  roots = []

  @children.each do |token|
    siblings = token.head_id ? nodes[token.head_id][:children] : roots
    siblings << nodes[token.id]
  end

  roots
end
|
173
|
+
|
174
|
+
# Finds all tokens in the sentence.
|
175
|
+
#
|
176
|
+
# @return [Enumerator] tokens in the sentence
|
177
|
+
#
|
178
|
+
# @example Iterating tokens
|
179
|
+
# tokens.each { |t| puts t.id }
|
180
|
+
#
|
181
|
+
# @example Create an array with only empty tokens
|
182
|
+
# tokens.select(&:is_empty?)
|
183
|
+
#
|
184
|
+
# @example Counting tokens
|
185
|
+
# puts tokens.count #=> 200
|
186
|
+
#
|
187
|
+
def tokens
  # An enumerator over the child list is returned, not the list itself.
  @children.to_enum(:each)
end
|
190
|
+
end
|
191
|
+
end
|