proiel 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +19 -0
- data/README.md +99 -0
- data/bin/console +6 -0
- data/bin/setup +5 -0
- data/lib/proiel/annotation_schema.rb +127 -0
- data/lib/proiel/citations.rb +84 -0
- data/lib/proiel/div.rb +133 -0
- data/lib/proiel/positional_tag.rb +127 -0
- data/lib/proiel/proiel_xml/proiel-1.0/proiel-1.0.xsd +172 -0
- data/lib/proiel/proiel_xml/proiel-1.0/teilite.xsd +7387 -0
- data/lib/proiel/proiel_xml/proiel-1.0/xml.xsd +287 -0
- data/lib/proiel/proiel_xml/proiel-2.0/proiel-2.0.xsd +185 -0
- data/lib/proiel/proiel_xml/reader.rb +237 -0
- data/lib/proiel/proiel_xml/schema.rb +81 -0
- data/lib/proiel/proiel_xml/validator.rb +177 -0
- data/lib/proiel/sentence.rb +191 -0
- data/lib/proiel/source.rb +114 -0
- data/lib/proiel/statistics.rb +41 -0
- data/lib/proiel/token.rb +407 -0
- data/lib/proiel/tokenization.rb +90 -0
- data/lib/proiel/treebank.rb +214 -0
- data/lib/proiel/treebank_object.rb +21 -0
- data/lib/proiel/version.rb +9 -0
- data/lib/proiel.rb +28 -0
- metadata +210 -0
@@ -0,0 +1,237 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2015 Marius L. Jøhndal
|
3
|
+
#
|
4
|
+
# See LICENSE in the top-level source directory for licensing terms.
|
5
|
+
#++
|
6
|
+
module PROIEL
|
7
|
+
module PROIELXML
|
8
|
+
# @api private
|
9
|
+
module Reader
|
10
|
+
# SAX mapping for `slash` elements.
#
# Declares the two XML attributes read from a `slash` element. Both are
# flagged as required; the dashed `target-id` attribute is exposed under the
# accessor name given by `as:` and is declared with `class: Integer`.
class Slash
  include SAXMachine

  attribute(:'target-id', as: :target_id, class: Integer, required: true)
  attribute(:relation, required: true)
end
|
17
|
+
|
18
|
+
# SAX mapping for `token` elements.
#
# Attributes are declared in the same order as before. XML attribute names
# that contain dashes are exposed under an underscored accessor name via the
# `as:` option. Nested `slash` elements are collected as `slashes`.
class Token
  include SAXMachine

  # Identifiers declared with an Integer class.
  attribute :id, class: Integer, required: true
  attribute :'head-id', as: :head_id, class: Integer

  # Plain attributes whose XML name is already a valid accessor name.
  %i[form lemma].each { |name| attribute name }
  attribute :'part-of-speech', as: :part_of_speech
  %i[morphology relation].each { |name| attribute name }

  # Dashed attributes without further options: the accessor name is the XML
  # name with dashes replaced by underscores.
  %w[empty-token-sort citation-part presentation-before presentation-after].each do |xml_name|
    attribute xml_name.to_sym, as: xml_name.tr('-', '_').to_sym
  end

  attribute :'antecedent-id', as: :antecedent_id, class: Integer

  %w[information-status contrast-group foreign-ids].each do |xml_name|
    attribute xml_name.to_sym, as: xml_name.tr('-', '_').to_sym
  end

  elements :slash, as: :slashes, class: Slash
end
|
40
|
+
|
41
|
+
# SAX mapping for `sentence` elements.
#
# Declares the sentence ID (required, Integer class), the annotation status
# (Symbol class, declared default `:unannotated`), the two presentation
# attributes and the nested `token` elements, collected as `tokens`.
class Sentence
  include SAXMachine

  attribute :id, class: Integer, required: true
  attribute :status, class: Symbol, default: :unannotated

  # `presentation-before` / `presentation-after`, in that order.
  %w[before after].each do |position|
    attribute :"presentation-#{position}", as: :"presentation_#{position}"
  end

  elements :token, as: :tokens, class: Token
end
|
52
|
+
|
53
|
+
# SAX mapping for `div` elements.
#
# Declares the `id` attribute, the two presentation attributes, the `title`
# child element and the nested `sentence` elements, collected as `sentences`.
class Div
  include SAXMachine

  attribute :id

  # `presentation-before` / `presentation-after`, in that order.
  %w[before after].each do |position|
    attribute :"presentation-#{position}", as: :"presentation_#{position}"
  end

  element :title
  elements :sentence, as: :sentences, class: Div::Sentence = Sentence
end
|
64
|
+
|
65
|
+
# SAX mapping for `source` elements.
#
# Declares the required `id` and `language` attributes, one single-valued
# child element per metadata field, and the nested `div` elements, collected
# as `divs`.
class Source
  include SAXMachine

  %i[id language].each { |name| attribute name, required: true }

  # Metadata fields, declared in the same order as before.
  %i[
    title author citation_part principal funder distributor
    distributor_address date license license_url reference_system editor
    editorial_note annotator reviewer
    electronic_text_editor electronic_text_title electronic_text_version
    electronic_text_publisher electronic_text_place electronic_text_date
    electronic_text_original_url electronic_text_license
    electronic_text_license_url
    printed_text_editor printed_text_title printed_text_edition
    printed_text_publisher printed_text_place printed_text_date
  ].each { |name| element name }

  elements :div, as: :divs, class: Div
end
|
104
|
+
|
105
|
+
# SAX mapping for `relations/value` elements.
#
# Each value declares four required attributes: `tag`, `summary`, `primary`
# and `secondary`.
class RelationValue
  include SAXMachine

  %i[tag summary primary secondary].each { |name| attribute name, required: true }
end
|
114
|
+
|
115
|
+
# SAX mapping for `relations` elements.
#
# Collects the nested `value` elements as `values`, each parsed with
# RelationValue.
class Relations
  include SAXMachine

  elements(:value, as: :values, class: RelationValue)
end
|
121
|
+
|
122
|
+
# SAX mapping for `parts_of_speech/value` elements.
#
# Each value declares the required attributes `tag` and `summary`.
class PartOfSpeechValue
  include SAXMachine

  %i[tag summary].each { |name| attribute name, required: true }
end
|
129
|
+
|
130
|
+
# SAX mapping for `parts_of_speech` elements.
#
# Collects the nested `value` elements as `values`, each parsed with
# PartOfSpeechValue.
class PartsOfSpeech
  include SAXMachine

  elements(:value, as: :values, class: PartOfSpeechValue)
end
|
136
|
+
|
137
|
+
# SAX mapping for `morphology/field/value` elements.
#
# Each value declares the required attributes `tag` and `summary`.
class MorphologyValue
  include SAXMachine

  %i[tag summary].each { |name| attribute name, required: true }
end
|
144
|
+
|
145
|
+
# SAX mapping for `morphology/field` elements.
#
# A field declares a required `tag` attribute and collects its nested `value`
# elements as `values`, each parsed with MorphologyValue.
class MorphologyField
  include SAXMachine

  attribute(:tag, required: true)

  elements(:value, as: :values, class: MorphologyValue)
end
|
153
|
+
|
154
|
+
# SAX mapping for `morphology` elements.
#
# Collects the nested `field` elements as `fields`, each parsed with
# MorphologyField.
class Morphology
  include SAXMachine

  elements(:field, as: :fields, class: MorphologyField)
end
|
160
|
+
|
161
|
+
# SAX mapping for `information_statuses/value` elements.
#
# Each value declares the required attributes `tag` and `summary`.
class InformationStatusValue
  include SAXMachine

  %i[tag summary].each { |name| attribute name, required: true }
end
|
168
|
+
|
169
|
+
# SAX mapping for `information_statuses` elements.
#
# Collects the nested `value` elements as `values`, each parsed with
# InformationStatusValue.
class InformationStatuses
  include SAXMachine

  elements(:value, as: :values, class: InformationStatusValue)
end
|
175
|
+
|
176
|
+
# SAX mapping for `annotation` elements.
#
# Declares one single-valued child element per tag set, each parsed with its
# own mapping class: relations, parts of speech, morphology and information
# statuses.
class Annotation
  include SAXMachine

  element(:relations, class: Relations)
  element(:parts_of_speech, as: :parts_of_speech, class: PartsOfSpeech)
  element(:morphology, class: Morphology)
  element(:information_statuses, as: :information_statuses, class: InformationStatuses)
end
|
185
|
+
|
186
|
+
# SAX mapping for the `proiel` element.
#
# Declares the `export-time` attribute and the required `schema-version`
# attribute (both exposed under underscored accessor names), the nested
# `source` elements, collected as `sources`, and the single `annotation`
# element.
class Proiel
  include SAXMachine

  attribute(:'export-time', as: :export_time)
  attribute(:'schema-version', as: :schema_version, required: true)

  elements(:source, as: :sources, class: Source)
  element(:annotation, class: Annotation)
end
|
196
|
+
|
197
|
+
# Top-level SAX mapping for a PROIEL XML file.
#
# Declares the single `proiel` element, parsed with the Proiel mapping class.
class TreebankFile
  include SAXMachine

  element(:proiel, class: Proiel)
end
|
203
|
+
|
204
|
+
# Parses a string of PROIEL XML.
#
# No validation is performed here. Given invalid PROIEL XML, parsing is
# likely to succeed, but the objects returned will be in an inconsistent
# state.
#
# @see parse_io
#
# @param xml [String] PROIEL XML to parse
#
# @return [TreebankFile]
#
def self.parse_xml(xml)
  treebank_file = TreebankFile.parse(xml)
  treebank_file
end
|
219
|
+
|
220
|
+
# Parses PROIEL XML read from a stream.
#
# The stream is read in full and the resulting string is handed to
# parse_xml. No validation is performed here. Given invalid PROIEL XML,
# parsing is likely to succeed, but the objects returned will be in an
# inconsistent state.
#
# @see parse_xml
#
# @param io [IO] stream representing the PROIEL XML file
#
# @return [TreebankFile]
#
def self.parse_io(io)
  xml = io.read
  parse_xml(xml)
end
|
235
|
+
end
|
236
|
+
end
|
237
|
+
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2015 Marius L. Jøhndal
|
3
|
+
#
|
4
|
+
# See LICENSE in the top-level source directory for licensing terms.
|
5
|
+
#++
|
6
|
+
module PROIEL
|
7
|
+
module PROIELXML
|
8
|
+
# Functionality concerned with PROIEL XML schema loading and versioning.
|
9
|
+
# Functionality for validation using a PROIEL XML schema is found in
|
10
|
+
# {PROIEL::PROIELXML::Validator}.
|
11
|
+
#
|
12
|
+
# @api private
|
13
|
+
module Schema
  # Returns the current version of the PROIEL XML schema.
  #
  # @return [String] schema version number
  #
  def self.current_proiel_xml_schema_version
    '2.0'
  end

  # Invalid PROIEL XML schema version error.
  #
  # This represents an error that occurs when an unknown PROIEL XML schema
  # version number is encountered or one that could not be parsed.
  class InvalidSchemaVersion < RuntimeError; end

  # Opens a PROIEL XML file and peeks at the schema version number that the
  # file claims it conforms to.
  #
  # A `proiel` root element without a `schema-version` attribute is reported
  # as version `'1.0'`.
  #
  # @param filename [String] name of the PROIEL XML file to inspect
  #
  # @return [String] schema version number
  #
  # @raise [InvalidSchemaVersion] if the root element is missing or is not
  #   `proiel`, or if the `schema-version` attribute has an unknown value
  #
  def self.check_schema_version_of_xml_file(filename)
    doc = Nokogiri::XML(File.read(filename))
    root = doc && doc.root

    unless root && root.name == 'proiel'
      raise InvalidSchemaVersion, 'top-level XML element not found'
    end

    case root.attr('schema-version')
    when '2.0'
      '2.0'
    when nil
      # No version attribute on the root element: treated as schema 1.0.
      '1.0'
    else
      raise InvalidSchemaVersion, 'invalid schema version number'
    end
  end

  # Loads a PROIEL XML schema.
  #
  # @param schema_version [String] schema version number to load
  #
  # @return [Nokogiri::XML::Schema] the loaded schema
  #
  # @raise [ArgumentError] if `schema_version` is not a known version
  #
  def self.load_proiel_xml_schema(schema_version)
    filename = proiel_xml_schema_filename(schema_version)

    # File.read opens, reads and closes the file; the previous
    # File.open(filename).read never closed the handle.
    Nokogiri::XML::Schema(File.read(filename))
  end

  # Determines the filename of a specific version of the PROIEL XML schema.
  #
  # The schema is looked up relative to the directory of this source file, in
  # `proiel-<version>/proiel-<version>.xsd`.
  #
  # @param schema_version [String] schema version number
  #
  # @return [String] filename
  #
  # @raise [ArgumentError] if `schema_version` is neither `'1.0'` nor `'2.0'`
  #
  def self.proiel_xml_schema_filename(schema_version)
    unless schema_version == '1.0' || schema_version == '2.0'
      raise ArgumentError, 'invalid schema version'
    end

    File.join(File.dirname(__FILE__),
              "proiel-#{schema_version}",
              "proiel-#{schema_version}.xsd")
  end
end
|
80
|
+
end
|
81
|
+
end
|
@@ -0,0 +1,177 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2015 Marius L. Jøhndal
|
3
|
+
#
|
4
|
+
# See LICENSE in the top-level source directory for licensing terms.
|
5
|
+
#++
|
6
|
+
module PROIEL
|
7
|
+
module PROIELXML
|
8
|
+
# A validator object that uses an XML schema as well as additional
|
9
|
+
# integrity checks to validate a PROIEL XML file. Functionality for
|
10
|
+
# loading the XML schema and checking the PROIEL XML version number is
|
11
|
+
# found in {PROIEL::PROIELXML::Schema}.
|
12
|
+
class Validator
  # Error messages collected by the checks that have been run so far.
  #
  # @return [Array<String>]
  attr_reader :errors

  # Creates a new validator for a PROIEL XML file.
  #
  # @param filename [String] name of PROIEL XML file to validate
  #
  def initialize(filename)
    @errors = []
    @filename = filename
  end

  # Runs all checks in order: well-formedness, schema version, schema
  # validation and referential integrity. Evaluation stops at the first
  # check that fails.
  #
  # If invalid, `errors` will contain error messages.
  #
  # @return [true, false]
  #
  def valid?
    wellformed? && valid_schema_version? && validates? && has_referential_integrity?
  end

  # Checks if the PROIEL XML file is well-formed XML by parsing it with the
  # parser's strict option enabled.
  #
  # If not well-formed, an error message will be appended to `errors`.
  #
  # @return [true, false]
  #
  def wellformed?
    Nokogiri::XML(File.read(@filename)) { |options| options.strict }

    true
  rescue Nokogiri::XML::SyntaxError
    @errors << 'XML file is not wellformed'

    false
  end

  # Checks if the PROIEL XML file has a valid schema version number.
  #
  # If invalid, an error message will be appended to `errors`.
  #
  # @return [true, false]
  #
  def valid_schema_version?
    version = PROIEL::PROIELXML::Schema.check_schema_version_of_xml_file(@filename)
    return true unless version.nil?

    @errors << 'invalid schema version'

    false
  rescue PROIEL::PROIELXML::Schema::InvalidSchemaVersion => e
    @errors << e.message

    false
  end

  # Checks if the PROIEL XML file validates against the schema that matches
  # the schema version the file declares.
  #
  # If invalid, error messages (prefixed with the line number) will be
  # appended to `errors`.
  #
  # @return [true, false]
  #
  def validates?
    document = Nokogiri::XML(File.read(@filename))
    version = PROIEL::PROIELXML::Schema.check_schema_version_of_xml_file(@filename)
    violations = PROIEL::PROIELXML::Schema.load_proiel_xml_schema(version).validate(document)
    return true if violations.empty?

    @errors += violations.map { |violation| "Line #{violation.line}: #{violation.message}" }

    false
  end

  # Checks the referential integrity of the PROIEL XML file.
  #
  # The file is loaded into a treebank. Sentence and token IDs are then
  # checked for repetition, and head, slash and antecedent references are
  # checked for existence and locality.
  #
  # If inconsistencies are found, error messages will be appended to `errors`.
  #
  # @return [true, false]
  #
  def has_referential_integrity?
    treebank = PROIEL::Treebank.new
    treebank.load_from_xml(@filename)

    problems = []
    seen_sentences = {}
    token_index = {}

    # Pass 1: record every sentence and token ID, reporting repeated IDs.
    # Each token ID is mapped to the IDs of its sentence, div and source.
    treebank.sources.each do |source|
      source.divs.each do |div|
        div.sentences.each do |sentence|
          problems << "Repeated sentence ID #{sentence.id}" if seen_sentences.key?(sentence.id)
          seen_sentences[sentence.id] = true

          sentence.tokens.each do |token|
            problems << "Repeated token ID #{token.id}" if token_index.key?(token.id)
            token_index[token.id] = { sentence: sentence.id, div: div.id, source: source.id }
          end
        end
      end
    end

    # Pass 2: check that every reference points at a known token inside the
    # expected domain.
    treebank.sources.each do |source|
      source.tokens.each do |token|
        # Head IDs and slash IDs should be sentence internal
        check_reference_locality(problems, token, token_index, :head_id, token.head_id,
                                 domain: :sentence, allow_nil: true)

        token.slashes.each do |_, target_id|
          check_reference_locality(problems, token, token_index, :slash_id, target_id,
                                   domain: :sentence, allow_nil: false)
        end

        # Antecedent IDs should be source internal
        check_reference_locality(problems, token, token_index, :antecedent_id, token.antecedent_id,
                                 domain: :source, allow_nil: true)
      end
    end

    # Pass 3: verify that all features are defined
    # TBD

    return true if problems.empty?

    @errors += problems

    false
  end

  private

  # Checks one token reference and appends a message to `errors` when the
  # reference is missing but mandatory, points at an unknown token, or points
  # at a token outside the given domain.
  #
  # @param errors [Array<String>] list that messages are appended to
  # @param token [#id] token that holds the reference; must also respond to
  #   the method named by `domain`
  # @param token_ids [Hash] maps token IDs to a hash of domain IDs
  # @param attribute_name [Symbol] name used in the error message
  # @param attribute_value [Object, nil] referenced token ID
  # @param domain [Symbol] key in the `token_ids` entries, and method on
  #   `token`, that identifies the domain the reference must stay within
  # @param allow_nil [true, false] whether a missing reference is acceptable
  def check_reference_locality(errors, token, token_ids, attribute_name,
                               attribute_value, domain: :sentence, allow_nil: false)
    unless attribute_value
      errors << "Token #{token.id}: #{attribute_name} is null" unless allow_nil
      return
    end

    target = token_ids[attribute_value]

    if target.nil?
      errors << "Token #{token.id}: #{attribute_name} references an unknown token"
    elsif target[domain] != token.send(domain).id
      errors << "Token #{token.id}: #{attribute_name} references a token in a different #{domain}"
    end
  end
end
|
176
|
+
end
|
177
|
+
end
|
@@ -0,0 +1,191 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2015 Marius L. Jøhndal
|
3
|
+
#
|
4
|
+
# See LICENSE in the top-level source directory for licensing terms.
|
5
|
+
#++
|
6
|
+
module PROIEL
|
7
|
+
# A sentence object in a treebank.
class Sentence < TreebankObject
  extend Memoist

  # @return [Integer] ID of the sentence
  attr_reader :id

  # @return [Div] parent div object
  attr_reader :div

  # @return [Symbol] annotation status of sentence
  attr_reader :status

  # @return [nil, String] presentation material before sentence
  attr_reader :presentation_before

  # @return [nil, String] presentation material after sentence
  attr_reader :presentation_after

  # Creates a new sentence object.
  #
  # If a block is given, it is called with the new sentence and its return
  # value becomes the sentence's list of tokens. Without a block the sentence
  # has no tokens.
  #
  # @param parent [Div] parent div object
  # @param id [Integer] ID of the sentence
  # @param status [String, Symbol] annotation status, stored as a symbol
  # @param presentation_before [nil, String] presentation material before
  #   the sentence; the string is frozen
  # @param presentation_after [nil, String] presentation material after the
  #   sentence; the string is frozen
  #
  # @raise [ArgumentError] if an argument has an unexpected type
  def initialize(parent, id, status, presentation_before, presentation_after, &block)
    @div = parent

    raise ArgumentError, 'integer expected' unless id.is_a?(Integer)
    @id = id

    raise ArgumentError, 'string or symbol expected' unless status.is_a?(String) || status.is_a?(Symbol)
    @status = status.to_sym

    raise ArgumentError, 'string or nil expected' unless presentation_before.nil? || presentation_before.is_a?(String)
    @presentation_before = presentation_before.freeze

    raise ArgumentError, 'string or nil expected' unless presentation_after.nil? || presentation_after.is_a?(String)
    @presentation_after = presentation_after.freeze

    # Default to an empty token list so that every method that walks
    # @children also works for a sentence created without a block.
    @children = block_given? ? block.call(self) : []
  end

  # @return [Source] parent source object
  def source
    @div.source
  end

  # @return [Treebank] parent treebank object
  def treebank
    @div.source.treebank
  end

  # @return [String] language of the sentence as an ISO 639-3 language tag
  def language
    source.language
  end

  memoize :language

  # @return [String] the complete citation for the sentence
  def citation
    [source.citation_part, citation_part].join(' ')
  end

  # Computes an appropriate citation component for the sentence.
  #
  # The component is a range built from the citation parts of the first and
  # the last token that have a citation. The computed citation component
  # must be concatenated with the citation component provided by the source
  # to produce a complete citation.
  #
  # @see citation
  #
  # @return [String] the citation component
  def citation_part
    cited = @children.select(&:has_citation?)
    first_part = cited.first ? cited.first.citation_part : nil
    last_part = cited.last ? cited.last.citation_part : nil

    Citations.citation_make_range(first_part, last_part)
  end

  # Returns the printable form of the sentence with all token forms and any
  # presentation data.
  #
  # @param options [Hash] options passed on unchanged to each token's
  #   `printable_form`
  #
  # @return [String] the printable form of the sentence
  def printable_form(options = {})
    [presentation_before,
     @children.map { |t| t.printable_form(options) },
     presentation_after].compact.join
  end

  # Checks if the sentence is reviewed.
  #
  # A sentence has been reviewed if its `status` is `:reviewed`.
  #
  # @return [true,false]
  def reviewed?
    @status == :reviewed
  end

  # Checks if the sentence is annotated.
  #
  # Since only annotated sentences can be reviewed, a sentence is annotated
  # if its `status` is either `:reviewed` or `:annotated`.
  #
  # @return [true,false]
  def annotated?
    @status == :reviewed || @status == :annotated
  end

  # Checks if the sentence is unannotated.
  #
  # A sentence is unannotated if its `status` is `:unannotated`.
  #
  # @return [true,false]
  def unannotated?
    @status == :unannotated
  end

  # Builds a syntax graph for the dependency annotation of the sentence and
  # inserts a dummy root node. The graph is represented as a hash of
  # hashes. Each hash contains the ID of the token, its relation (to its
  # syntactically dominating token) and a list of secondary edges.
  #
  # @return [Hash] a single graph with a dummy root node represented as a hash
  #
  # @example
  #
  #   sentence.syntax_graph # => { id: nil, relation: nil, children: [{ id: 1000, relation: "pred", children: [ { id: 1001, relation: "xcomp", children: [], slashes: [["xsub", 1000]]}]}], slashes: [] }
  #
  def syntax_graph
    { id: nil, relation: nil, children: syntax_graphs, slashes: [] }
  end

  # Builds syntax graphs for the dependency annotation of the sentence.
  # Multiple graphs may be returned as the function does not insert an
  # empty dummy root node. Each graph is represented as a hash of hashes.
  # Each hash contains the ID of the token, its relation (to its
  # syntactically dominating token) and a list of secondary edges.
  #
  # @return [Array] zero or more syntax graphs represented as hashes
  #
  # @example Get the syntax graphs without a dummy root node
  #
  #   sentence.syntax_graphs # => [{ id: 1000, relation: "pred", children: [ { id: 1001, relation: "xcomp", children: [], slashes: [["xsub", 1000]]}]}]
  #
  def syntax_graphs
    # Pass 1: create an attribute hash for each token, indexed by token ID.
    nodes = @children.each_with_object({}) do |token, map|
      map[token.id] = {
        id: token.id,
        relation: token.relation,
        children: [],
        slashes: token.slashes,
      }
    end

    # Pass 2: attach each token with a head ID to its head's children list;
    # tokens without a head ID are the roots of the graphs to return.
    # NOTE(review): a head ID that is not the ID of a token in this sentence
    # makes the lookup return nil and the append below raise.
    graphs = []

    @children.each do |token|
      if token.head_id
        nodes[token.head_id][:children] << nodes[token.id]
      else
        graphs << nodes[token.id]
      end
    end

    graphs
  end

  # Finds all tokens in the sentence.
  #
  # @return [Enumerator] tokens in the sentence
  #
  # @example Iterating tokens
  #   tokens.each { |t| puts t.id }
  #
  # @example Create an array with only empty tokens
  #   tokens.select(&:is_empty?)
  #
  # @example Counting tokens
  #   puts tokens.count #=> 200
  #
  def tokens
    @children.to_enum
  end
end
|
191
|
+
end
|