proiel 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,237 @@
1
+ #--
2
+ # Copyright (c) 2015 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
+ module PROIEL
7
+ module PROIELXML
8
+ # @api private
9
+ module Reader
10
# SAX mapping for `slash` elements.
#
# Both XML attributes are declared as required. The `target-id` attribute
# is exposed as `target_id` and is declared with class Integer.
class Slash
  include SAXMachine

  attribute(:'target-id', as: :target_id, class: Integer, required: true)
  attribute(:relation, required: true)
end
17
+
18
# SAX mapping for `token` elements.
#
# Hyphenated XML attribute names are exposed through underscored accessors
# (for example `head-id` as `head_id`). Nested `slash` elements are
# collected in `slashes`. Declarations are issued in the same order as the
# attributes are listed below.
class Token
  include SAXMachine

  attribute(:id, class: Integer, required: true)
  attribute(:'head-id', as: :head_id, class: Integer)

  %i[form lemma].each { |name| attribute(name) }
  attribute(:'part-of-speech', as: :part_of_speech)
  %i[morphology relation].each { |name| attribute(name) }

  # Plain hyphenated attributes: the accessor name is the XML name with
  # hyphens replaced by underscores.
  %w[empty-token-sort citation-part presentation-before presentation-after].each do |xml_name|
    attribute(xml_name.to_sym, as: xml_name.tr('-', '_').to_sym)
  end

  attribute(:'antecedent-id', as: :antecedent_id, class: Integer)

  %w[information-status contrast-group foreign-ids].each do |xml_name|
    attribute(xml_name.to_sym, as: xml_name.tr('-', '_').to_sym)
  end

  elements(:slash, as: :slashes, class: Slash)
end
40
+
41
# SAX mapping for `sentence` elements.
#
# `id` is required and declared with class Integer; `status` is declared
# with class Symbol and a default of `:unannotated`. Nested `token`
# elements are collected in `tokens`.
class Sentence
  include SAXMachine

  attribute(:id, class: Integer, required: true)
  attribute(:status, class: Symbol, default: :unannotated)

  # presentation-before / presentation-after => presentation_before / presentation_after
  %w[before after].each do |position|
    attribute(:"presentation-#{position}", as: :"presentation_#{position}")
  end

  elements(:token, as: :tokens, class: Token)
end
52
+
53
# SAX mapping for `div` elements.
#
# Reads the `id` attribute, the two presentation attributes, a `title`
# child element and any number of `sentence` child elements (exposed as
# `sentences`).
class Div
  include SAXMachine

  attribute(:id)

  # presentation-before / presentation-after => presentation_before / presentation_after
  %w[before after].each do |position|
    attribute(:"presentation-#{position}", as: :"presentation_#{position}")
  end

  element(:title)
  elements(:sentence, as: :sentences, class: Sentence)
end
64
+
65
# SAX mapping for `source` elements.
#
# `id` and `language` are required attributes. Each metadata child element
# listed below is read into an accessor of the same name, and nested `div`
# elements are collected in `divs`.
class Source
  include SAXMachine

  %i[id language].each { |name| attribute(name, required: true) }

  # Metadata child elements, declared in this order.
  %i[
    title
    author
    citation_part
    principal
    funder
    distributor
    distributor_address
    date
    license
    license_url
    reference_system
    editor
    editorial_note
    annotator
    reviewer
    electronic_text_editor
    electronic_text_title
    electronic_text_version
    electronic_text_publisher
    electronic_text_place
    electronic_text_date
    electronic_text_original_url
    electronic_text_license
    electronic_text_license_url
    printed_text_editor
    printed_text_title
    printed_text_edition
    printed_text_publisher
    printed_text_place
    printed_text_date
  ].each { |tag| element(tag) }

  elements(:div, as: :divs, class: Div)
end
104
+
105
# SAX mapping for `relations/value` elements.
#
# All four attributes (`tag`, `summary`, `primary`, `secondary`) are
# declared as required.
class RelationValue
  include SAXMachine

  %i[tag summary primary secondary].each { |name| attribute(name, required: true) }
end
114
+
115
# SAX mapping for `relations` elements; nested `value` elements are
# collected in `values` as RelationValue objects.
class Relations
  include SAXMachine

  elements(:value, as: :values, class: RelationValue)
end
121
+
122
# SAX mapping for `parts_of_speech/value` elements.
#
# Both attributes (`tag` and `summary`) are declared as required.
class PartOfSpeechValue
  include SAXMachine

  %i[tag summary].each { |name| attribute(name, required: true) }
end
129
+
130
# SAX mapping for `parts_of_speech` elements; nested `value` elements are
# collected in `values` as PartOfSpeechValue objects.
class PartsOfSpeech
  include SAXMachine

  elements(:value, as: :values, class: PartOfSpeechValue)
end
136
+
137
# SAX mapping for `morphology/field/value` elements.
#
# Both attributes (`tag` and `summary`) are declared as required.
class MorphologyValue
  include SAXMachine

  %i[tag summary].each { |name| attribute(name, required: true) }
end
144
+
145
# SAX mapping for `morphology/field` elements.
#
# `tag` is a required attribute; nested `value` elements are collected in
# `values` as MorphologyValue objects.
class MorphologyField
  include SAXMachine

  attribute(:tag, required: true)

  elements(:value, as: :values, class: MorphologyValue)
end
153
+
154
# SAX mapping for `morphology` elements; nested `field` elements are
# collected in `fields` as MorphologyField objects.
class Morphology
  include SAXMachine

  elements(:field, as: :fields, class: MorphologyField)
end
160
+
161
# SAX mapping for `information_statuses/value` elements.
#
# Both attributes (`tag` and `summary`) are declared as required.
class InformationStatusValue
  include SAXMachine

  %i[tag summary].each { |name| attribute(name, required: true) }
end
168
+
169
# SAX mapping for `information_statuses` elements; nested `value` elements
# are collected in `values` as InformationStatusValue objects.
class InformationStatuses
  include SAXMachine

  elements(:value, as: :values, class: InformationStatusValue)
end
175
+
176
# SAX mapping for `annotation` elements.
#
# Each of the four child elements is parsed with its own mapping class and
# exposed through an accessor named after the element.
class Annotation
  include SAXMachine

  element(:relations, class: Relations)
  element(:parts_of_speech, as: :parts_of_speech, class: PartsOfSpeech)
  element(:morphology, class: Morphology)
  element(:information_statuses, as: :information_statuses, class: InformationStatuses)
end
185
+
186
# SAX mapping for the `proiel` element.
#
# Reads the `export-time` attribute and the required `schema-version`
# attribute (as `export_time` and `schema_version`), any number of `source`
# child elements (as `sources`) and a single `annotation` child element.
class Proiel
  include SAXMachine

  attribute(:'export-time', as: :export_time)
  attribute(:'schema-version', as: :schema_version, required: true)

  elements(:source, as: :sources, class: Source)
  element(:annotation, class: Annotation)
end
196
+
197
# Top-level SAX mapping for a PROIEL XML file; the `proiel` element is
# parsed into a Proiel object exposed as `proiel`.
class TreebankFile
  include SAXMachine

  element(:proiel, class: Proiel)
end
203
+
204
# Parses a string of PROIEL XML.
#
# No validation is performed here. Invalid PROIEL XML is likely to parse
# without error, but the objects returned will then be in an inconsistent
# state.
#
# @see parse_io
#
# @param xml [String] PROIEL XML to parse
#
# @return [TreebankFile]
#
def self.parse_xml(xml)
  treebank_file = TreebankFile.parse(xml)
  treebank_file
end
219
+
220
# Parses PROIEL XML read from a stream.
#
# The whole stream is read into memory and handed to {parse_xml}. No
# validation is performed here. Invalid PROIEL XML is likely to parse
# without error, but the objects returned will then be in an inconsistent
# state.
#
# @see parse_xml
#
# @param io [IO] stream representing the PROIEL XML file
#
# @return [TreebankFile]
#
def self.parse_io(io)
  xml = io.read
  parse_xml(xml)
end
235
+ end
236
+ end
237
+ end
@@ -0,0 +1,81 @@
1
+ #--
2
+ # Copyright (c) 2015 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
+ module PROIEL
7
+ module PROIELXML
8
# Functionality concerned with PROIEL XML schema loading and versioning.
# Functionality for validation using a PROIEL XML schema is found in
# {PROIEL::PROIELXML::Validator}.
#
# @api private
module Schema
  # Returns the current version of the PROIEL XML schema.
  #
  # @return [String] schema version number
  #
  def self.current_proiel_xml_schema_version
    '2.0'
  end

  # Invalid PROIEL XML schema version error.
  #
  # This represents an error that occurs when an unknown PROIEL XML schema
  # version number is encountered or one that could not be parsed.
  class InvalidSchemaVersion < RuntimeError; end

  # Opens a PROIEL XML file and peeks at the schema version number that the
  # file claims it conforms to.
  #
  # A `proiel` root element without a `schema-version` attribute is reported
  # as version `'1.0'`.
  #
  # @param filename [String] name of the PROIEL XML file to inspect
  #
  # @return [String] schema version number
  #
  # @raise [InvalidSchemaVersion] if the root element is missing or is not
  #   `proiel`, or if the `schema-version` attribute has an unknown value
  #
  def self.check_schema_version_of_xml_file(filename)
    doc = Nokogiri::XML(File.read(filename))
    root = doc && doc.root

    unless root && root.name == 'proiel'
      raise InvalidSchemaVersion, 'top-level XML element not found'
    end

    case root.attr('schema-version')
    when '2.0'
      '2.0'
    when nil
      # No attribute at all is treated as the original 1.0 format.
      '1.0'
    else
      raise InvalidSchemaVersion, 'invalid schema version number'
    end
  end

  # Loads a PROIEL XML schema.
  #
  # @param schema_version [String] schema version number
  #
  # @return [Nokogiri::XML::Schema] the loaded schema
  #
  # @raise [ArgumentError] if `schema_version` is not a known version
  #
  def self.load_proiel_xml_schema(schema_version)
    filename = proiel_xml_schema_filename(schema_version)

    # File.read opens, reads and closes the file; a bare File.open(...).read
    # would leave the handle open until it is garbage collected.
    Nokogiri::XML::Schema(File.read(filename))
  end

  # Determines the filename of a specific version of the PROIEL XML schema.
  #
  # The schema is located relative to the directory of this source file, in
  # `proiel-<version>/proiel-<version>.xsd`.
  #
  # @param schema_version [String] schema version number
  #
  # @return [String] filename
  #
  # @raise [ArgumentError] if `schema_version` is neither `'1.0'` nor `'2.0'`
  #
  def self.proiel_xml_schema_filename(schema_version)
    unless schema_version == '1.0' || schema_version == '2.0'
      raise ArgumentError, 'invalid schema version'
    end

    File.join(File.dirname(__FILE__),
              "proiel-#{schema_version}",
              "proiel-#{schema_version}.xsd")
  end
end
80
+ end
81
+ end
@@ -0,0 +1,177 @@
1
+ #--
2
+ # Copyright (c) 2015 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
+ module PROIEL
7
+ module PROIELXML
8
# Validates a PROIEL XML file using an XML schema together with additional
# integrity checks. Loading of the XML schema and detection of the PROIEL
# XML version number live in {PROIEL::PROIELXML::Schema}.
class Validator
  # @return [Array<String>] error messages generated during validation
  attr_reader :errors

  # Creates a new validator for a PROIEL XML file.
  #
  # @param filename [String] name of PROIEL XML file to validate
  #
  def initialize(filename)
    @errors = []
    @filename = filename
  end

  # Checks if the PROIEL XML file is valid: well-formed, carrying a valid
  # schema version, validating against the schema and referentially
  # consistent. The checks run in that order and stop at the first failure.
  #
  # If invalid, `errors` will contain error messages.
  #
  # @return [true, false]
  #
  def valid?
    wellformed? && valid_schema_version? && validates? && has_referential_integrity?
  end

  # Checks if the PROIEL XML file is well-formed XML by parsing it in strict
  # mode.
  #
  # If not well-formed, an error message will be appended to `errors`.
  #
  # @return [true, false]
  #
  def wellformed?
    Nokogiri::XML(File.read(@filename), &:strict)

    true
  rescue Nokogiri::XML::SyntaxError
    @errors << 'XML file is not wellformed'

    false
  end

  # Checks if the PROIEL XML file has a valid schema version number.
  #
  # If invalid, an error message will be appended to `errors`.
  #
  # @return [true, false]
  #
  def valid_schema_version?
    version = PROIEL::PROIELXML::Schema.check_schema_version_of_xml_file(@filename)
    return true unless version.nil?

    @errors << 'invalid schema version'

    false
  rescue PROIEL::PROIELXML::Schema::InvalidSchemaVersion => e
    @errors << e.message

    false
  end

  # Checks if the PROIEL XML file validates against the schema matching the
  # version the file declares.
  #
  # If invalid, one message per schema error (prefixed with its line number)
  # will be appended to `errors`.
  #
  # @return [true, false]
  #
  def validates?
    document = Nokogiri::XML(File.read(@filename))
    version = PROIEL::PROIELXML::Schema.check_schema_version_of_xml_file(@filename)
    schema = PROIEL::PROIELXML::Schema.load_proiel_xml_schema(version)

    problems = schema.validate(document)
    return true if problems.empty?

    @errors += problems.map { |e| "Line #{e.line}: #{e.message}" }

    false
  end

  # Checks the referential integrity of the PROIEL XML file.
  #
  # The file is loaded into a new treebank; sentence and token IDs are then
  # checked for duplicates, and head, slash and antecedent references are
  # checked against the collected token IDs.
  #
  # If inconsistencies are found, error messages will be appended to `errors`.
  #
  # @return [true, false]
  #
  def has_referential_integrity?
    treebank = PROIEL::Treebank.new
    treebank.load_from_xml(@filename)

    problems = []

    # Pass 1: keep track of all object IDs and look for duplicates
    token_ids = index_object_ids(treebank, problems)

    # Pass 2: check object ID references
    check_object_references(treebank, token_ids, problems)

    # Pass 3: verify that all features are defined
    # TBD

    return true if problems.empty?

    @errors += problems

    false
  end

  private

  # Walks every source, div, sentence and token, reporting repeated sentence
  # and token IDs into `problems`.
  #
  # @return [Hash] token ID => hash with the IDs of the enclosing sentence,
  #   div and source (keys `:sentence`, `:div`, `:source`)
  def index_object_ids(treebank, problems)
    sentence_ids = {}
    token_ids = {}

    treebank.sources.each do |source|
      source.divs.each do |div|
        div.sentences.each do |sentence|
          problems << "Repeated sentence ID #{sentence.id}" if sentence_ids.key?(sentence.id)
          sentence_ids[sentence.id] = true

          sentence.tokens.each do |token|
            problems << "Repeated token ID #{token.id}" if token_ids.key?(token.id)
            token_ids[token.id] = { sentence: sentence.id, div: div.id, source: source.id }
          end
        end
      end
    end

    token_ids
  end

  # Checks the head, slash and antecedent references of every token in every
  # source, reporting inconsistencies into `problems`.
  def check_object_references(treebank, token_ids, problems)
    treebank.sources.each do |source|
      source.tokens.each do |token|
        # Head IDs and slash IDs should be sentence internal
        check_reference_locality(problems, token, token_ids, :head_id, token.head_id,
                                 domain: :sentence, allow_nil: true)

        token.slashes.each do |_, target_id|
          check_reference_locality(problems, token, token_ids, :slash_id, target_id,
                                   domain: :sentence, allow_nil: false)
        end

        # Antecedent IDs should be source internal
        check_reference_locality(problems, token, token_ids, :antecedent_id, token.antecedent_id,
                                 domain: :source, allow_nil: true)
      end
    end
  end

  # Verifies a single reference held by `token`.
  #
  # A message is appended to `errors` when the referenced ID is not in
  # `token_ids`, when the referenced token sits in a different `domain`
  # (`:sentence` or `:source`) than `token`, or when the reference is
  # missing and `allow_nil` is false.
  def check_reference_locality(errors, token, token_ids, attribute_name,
                               attribute_value, domain: :sentence, allow_nil: false)
    if attribute_value
      target = token_ids[attribute_value]

      if target.nil?
        errors << "Token #{token.id}: #{attribute_name} references an unknown token"
      elsif target[domain] != token.send(domain).id
        errors << "Token #{token.id}: #{attribute_name} references a token in a different #{domain}"
      end
    elsif !allow_nil
      errors << "Token #{token.id}: #{attribute_name} is null"
    end
  end
end
176
+ end
177
+ end
@@ -0,0 +1,191 @@
1
+ #--
2
+ # Copyright (c) 2015 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
+ module PROIEL
7
# A sentence object in a treebank.
class Sentence < TreebankObject
  extend Memoist

  # @return [Integer] ID of the sentence
  attr_reader :id

  # @return [Div] parent div object
  attr_reader :div

  # @return [Symbol] annotation status of sentence
  attr_reader :status

  # @return [nil, String] presentation material before sentence
  attr_reader :presentation_before

  # @return [nil, String] presentation material after sentence
  attr_reader :presentation_after

  # Creates a new sentence object.
  #
  # If a block is given it is called with the new sentence, and its return
  # value is stored as the sentence's children (the collection that
  # `tokens` enumerates).
  #
  # @param parent [Div] parent div object
  # @param id [Integer] ID of the sentence
  # @param status [String, Symbol] annotation status; stored as a symbol
  # @param presentation_before [nil, String] frozen when stored
  # @param presentation_after [nil, String] frozen when stored
  #
  # @raise [ArgumentError] if an argument has an unexpected type
  def initialize(parent, id, status, presentation_before, presentation_after, &block)
    unless id.is_a?(Integer)
      raise ArgumentError, 'integer expected'
    end

    unless status.is_a?(String) || status.is_a?(Symbol)
      raise ArgumentError, 'string or symbol expected'
    end

    unless presentation_before.nil? || presentation_before.is_a?(String)
      raise ArgumentError, 'string or nil expected'
    end

    unless presentation_after.nil? || presentation_after.is_a?(String)
      raise ArgumentError, 'string or nil expected'
    end

    @div = parent
    @id = id
    @status = status.to_sym
    @presentation_before = presentation_before.freeze
    @presentation_after = presentation_after.freeze

    @children = yield(self) if block_given?
  end

  # @return [Source] parent source object
  def source
    @div.source
  end

  # @return [Treebank] parent treebank object
  def treebank
    @div.source.treebank
  end

  # @return [String] language of the sentence as an ISO 639-3 language tag
  def language
    source.language
  end

  memoize :language

  # @return [String] the complete citation for the sentence
  def citation
    parts = [source.citation_part, citation_part]
    parts.join(' ')
  end

  # Computes an appropriate citation component for the sentence from the
  # first and last child that report having a citation.
  #
  # The computed citation component must be concatenated with the citation
  # component provided by the source to produce a complete citation.
  #
  # @see citation
  #
  # @return [String] the citation component
  def citation_part
    cited = @children.select(&:has_citation?)
    first_cited = cited.first
    last_cited = cited.last

    range_start = first_cited && first_cited.citation_part
    range_end = last_cited && last_cited.citation_part

    Citations.citation_make_range(range_start, range_end)
  end

  # Returns the printable form of the sentence with all token forms and any
  # presentation data.
  #
  # @param options [Hash] passed on unchanged to each child's
  #   `printable_form`
  #
  # @return [String] the printable form of the sentence
  def printable_form(options = {})
    pieces = [presentation_before]
    pieces << @children.map { |child| child.printable_form(options) }
    pieces << presentation_after

    # Missing presentation material is dropped before joining.
    pieces.compact.join
  end

  # Checks if the sentence is reviewed.
  #
  # A sentence has been reviewed if its `status` is `:reviewed`.
  #
  # @return [true,false]
  def reviewed?
    :reviewed == @status
  end

  # Checks if the sentence is annotated.
  #
  # Since only annotated sentences can be reviewed, a sentence is annotated
  # if its `status` is either `:reviewed` or `:annotated`.
  #
  # @return [true,false]
  def annotated?
    %i[reviewed annotated].include?(@status)
  end

  # Checks if the sentence is unannotated.
  #
  # A sentence is unannotated if its `status` is `:unannotated`.
  #
  # @return [true,false]
  def unannotated?
    :unannotated == @status
  end

  # Builds a syntax graph for the dependency annotation of the sentence and
  # inserts a dummy root node. The graph is represented as a hash of
  # hashes. Each hash contains the ID of the token, its relation (to its
  # syntactically dominating token), its children and a list of secondary
  # edges.
  #
  # @return [Hash] a single graph with a dummy root node represented as a hash
  #
  # @example Get a single syntax graph with a dummy root node
  #
  #   sentence.syntax_graph # => { id: nil, relation: nil, children: [{ id: 1000, relation: "pred", children: [{ id: 1001, relation: "xcomp", children: [], slashes: [["xsub", 1000]] }] }], slashes: [] }
  #
  def syntax_graph
    roots = syntax_graphs

    { id: nil, relation: nil, children: roots, slashes: [] }
  end

  # Builds syntax graphs for the dependency annotation of the sentence.
  # Multiple graphs may be returned as the function does not insert an
  # empty dummy root node. Each graph is represented as a hash of hashes.
  # Each hash contains the ID of the token, its relation (to its
  # syntactically dominating token), its children and a list of secondary
  # edges.
  #
  # @return [Array] zero or more syntax graphs represented as hashes
  #
  # @example Get the syntax graphs without a dummy root node
  #
  #   sentence.syntax_graphs # => [{ id: 1000, relation: "pred", children: [{ id: 1001, relation: "xcomp", children: [], slashes: [["xsub", 1000]] }] }]
  #
  def syntax_graphs
    # Pass 1: one node hash per token, indexed by token ID
    nodes = @children.each_with_object({}) do |token, index|
      index[token.id] = {
        id: token.id,
        relation: token.relation,
        children: [],
        slashes: token.slashes,
      }
    end

    # Pass 2: hang each node with a head ID under its head's node; nodes
    # without a head ID become the roots that are returned
    roots = []

    @children.each do |token|
      node = nodes[token.id]

      if token.head_id
        nodes[token.head_id][:children] << node
      else
        roots << node
      end
    end

    roots
  end

  # Finds all tokens in the sentence.
  #
  # @return [Enumerator] tokens in the sentence
  #
  # @example Iterating tokens
  #   tokens.each { |t| puts t.id }
  #
  # @example Create an array with only empty tokens
  #   tokens.select(&:is_empty?)
  #
  # @example Counting tokens
  #   puts tokens.count #=> 200
  #
  def tokens
    @children.to_enum(:each)
  end
end
191
+ end