proiel 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,237 @@
1
+ #--
2
+ # Copyright (c) 2015 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
+ module PROIEL
7
+ module PROIELXML
8
+ # @api private
9
+ module Reader
10
# SAX mapping for a `slash` element nested inside a `token` element.
#
# Exposes the `target-id` XML attribute as the integer `target_id` and the
# `relation` XML attribute as `relation`. Both are declared as required.
class Slash
  include SAXMachine

  attribute :"target-id", as: :target_id, class: Integer, required: true
  attribute :relation,    required: true
end
17
+
18
# SAX mapping for a `token` element.
#
# Hyphenated XML attribute names are exposed under underscored reader
# names. `id`, `head-id` and `antecedent-id` are converted to integers; all
# other attributes are kept as parsed. Only `id` is declared as required.
# Nested `slash` elements are collected, in document order, in `slashes`.
class Token
  include SAXMachine

  # Identity and dependency annotation
  attribute :id,        class: Integer, required: true
  attribute :"head-id", as: :head_id, class: Integer

  # Lexical and morphosyntactic annotation
  attribute :form
  attribute :lemma
  attribute :"part-of-speech", as: :part_of_speech
  attribute :morphology
  attribute :relation
  attribute :"empty-token-sort", as: :empty_token_sort

  # Citation and presentation material
  attribute :"citation-part",       as: :citation_part
  attribute :"presentation-before", as: :presentation_before
  attribute :"presentation-after",  as: :presentation_after

  # Information structure annotation and external identifiers
  attribute :"antecedent-id",      as: :antecedent_id, class: Integer
  attribute :"information-status", as: :information_status
  attribute :"contrast-group",     as: :contrast_group
  attribute :"foreign-ids",        as: :foreign_ids

  elements :slash, as: :slashes, class: Slash
end
40
+
41
# SAX mapping for a `sentence` element.
#
# `id` is a required integer. `status` is converted to a symbol and is
# declared with the default `:unannotated`. Nested `token` elements are
# collected in `tokens`.
class Sentence
  include SAXMachine

  attribute :id,     class: Integer, required: true
  attribute :status, class: Symbol, default: :unannotated
  attribute :"presentation-before", as: :presentation_before
  attribute :"presentation-after",  as: :presentation_after

  elements :token, as: :tokens, class: Token
end
52
+
53
# SAX mapping for a `div` element.
#
# Reads the optional `id` and presentation attributes, the `title` child
# element, and collects nested `sentence` elements in `sentences`.
class Div
  include SAXMachine

  attribute :id
  attribute :"presentation-before", as: :presentation_before
  attribute :"presentation-after",  as: :presentation_after

  element  :title
  elements :sentence, as: :sentences, class: Sentence
end
64
+
65
# SAX mapping for a `source` element.
#
# `id` and `language` are required attributes. Each metadata field below is
# read from a child element with exactly the name given here, and nested
# `div` elements are collected in `divs`.
#
# NOTE(review): the metadata element names are written with underscores
# while XML attribute names elsewhere in this reader use hyphens; confirm
# against the PROIEL XML schema that these are the element names actually
# used in the files.
class Source
  include SAXMachine

  attribute :id,       required: true
  attribute :language, required: true

  # General metadata
  element :title
  element :author
  element :citation_part
  element :principal
  element :funder
  element :distributor
  element :distributor_address
  element :date
  element :license
  element :license_url
  element :reference_system
  element :editor
  element :editorial_note
  element :annotator
  element :reviewer

  # Metadata about the electronic text
  element :electronic_text_editor
  element :electronic_text_title
  element :electronic_text_version
  element :electronic_text_publisher
  element :electronic_text_place
  element :electronic_text_date
  element :electronic_text_original_url
  element :electronic_text_license
  element :electronic_text_license_url

  # Metadata about the printed text
  element :printed_text_editor
  element :printed_text_title
  element :printed_text_edition
  element :printed_text_publisher
  element :printed_text_place
  element :printed_text_date

  elements :div, as: :divs, class: Div
end
104
+
105
# SAX mapping for a `value` element inside `relations`.
#
# All four attributes (`tag`, `summary`, `primary`, `secondary`) are
# declared as required and are kept as parsed.
class RelationValue
  include SAXMachine

  attribute :tag,       required: true
  attribute :summary,   required: true
  attribute :primary,   required: true
  attribute :secondary, required: true
end
114
+
115
# SAX mapping for a `relations` element; its `value` children are collected
# in `values` as {RelationValue} objects.
class Relations
  include SAXMachine

  elements :value, as: :values, class: RelationValue
end
121
+
122
# SAX mapping for a `value` element inside `parts_of_speech`. The `tag` and
# `summary` attributes are both declared as required.
class PartOfSpeechValue
  include SAXMachine

  attribute :tag,     required: true
  attribute :summary, required: true
end
129
+
130
# SAX mapping for a `parts_of_speech` element; its `value` children are
# collected in `values` as {PartOfSpeechValue} objects.
class PartsOfSpeech
  include SAXMachine

  elements :value, as: :values, class: PartOfSpeechValue
end
136
+
137
# SAX mapping for a `value` element inside a morphology `field`. The `tag`
# and `summary` attributes are both declared as required.
class MorphologyValue
  include SAXMachine

  attribute :tag,     required: true
  attribute :summary, required: true
end
144
+
145
# SAX mapping for a `field` element inside `morphology`. The field carries a
# required `tag` attribute, and its `value` children are collected in
# `values` as {MorphologyValue} objects.
class MorphologyField
  include SAXMachine

  attribute :tag, required: true

  elements :value, as: :values, class: MorphologyValue
end
153
+
154
# SAX mapping for a `morphology` element; its `field` children are collected
# in `fields` as {MorphologyField} objects.
class Morphology
  include SAXMachine

  elements :field, as: :fields, class: MorphologyField
end
160
+
161
# SAX mapping for a `value` element inside `information_statuses`. The `tag`
# and `summary` attributes are both declared as required.
class InformationStatusValue
  include SAXMachine

  attribute :tag,     required: true
  attribute :summary, required: true
end
168
+
169
# SAX mapping for an `information_statuses` element; its `value` children
# are collected in `values` as {InformationStatusValue} objects.
class InformationStatuses
  include SAXMachine

  elements :value, as: :values, class: InformationStatusValue
end
175
+
176
# SAX mapping for an `annotation` element, which holds the four tag set
# declarations: relations, parts of speech, morphology and information
# statuses. Each is parsed into its own mapping class.
#
# NOTE(review): `parts_of_speech` and `information_statuses` pass an `as:`
# option equal to the element name, which is redundant as written. This
# may mean hyphenated element names (`parts-of-speech`,
# `information-statuses`) were intended; confirm against the PROIEL XML
# schema before changing anything.
class Annotation
  include SAXMachine

  element :relations,            class: Relations
  element :parts_of_speech,      as: :parts_of_speech, class: PartsOfSpeech
  element :morphology,           class: Morphology
  element :information_statuses, as: :information_statuses, class: InformationStatuses
end
185
+
186
# SAX mapping for the `proiel` element.
#
# Exposes the `export-time` attribute as `export_time` and the required
# `schema-version` attribute as `schema_version`. Nested `source` elements
# are collected in `sources`; a nested `annotation` element is parsed into
# an {Annotation} object.
class Proiel
  include SAXMachine

  attribute :"export-time",    as: :export_time
  attribute :"schema-version", as: :schema_version, required: true

  elements :source,     as: :sources, class: Source
  element  :annotation, class: Annotation
end
196
+
197
# Entry-point SAX mapping for a whole PROIEL XML document. The `proiel`
# element is parsed into a {Proiel} object available as `proiel`.
class TreebankFile
  include SAXMachine

  element :proiel, class: Proiel
end
203
+
204
# Parses a string of PROIEL XML.
#
# No schema validation is performed here. Invalid PROIEL XML will most
# likely still parse, but the objects returned may then be in an
# inconsistent state.
#
# @see parse_io
#
# @param xml [String] PROIEL XML to parse
#
# @return [TreebankFile] result of `TreebankFile.parse`
#
def self.parse_xml(xml)
  TreebankFile.parse(xml)
end
219
+
220
# Parses PROIEL XML read from an IO stream.
#
# The stream is read to the end and its contents are handed to
# {parse_xml}. No schema validation is performed here. Invalid PROIEL XML
# will most likely still parse, but the objects returned may then be in an
# inconsistent state.
#
# @see parse_xml
#
# @param io [IO] stream representing the PROIEL XML file
#
# @return [TreebankFile]
#
def self.parse_io(io)
  xml = io.read
  parse_xml(xml)
end
235
+ end
236
+ end
237
+ end
@@ -0,0 +1,81 @@
1
+ #--
2
+ # Copyright (c) 2015 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
+ module PROIEL
7
+ module PROIELXML
8
+ # Functionality concerned with PROIEL XML schema loading and versioning.
9
+ # Functionality for validation using a PROIEL XML schema is found in
10
+ # {PROIEL::PROIELXML::Validator}.
11
+ #
12
+ # @api private
13
+ module Schema
14
# Reports which PROIEL XML schema version is considered current.
#
# @return [String] schema version number, presently `'2.0'`
#
def self.current_proiel_xml_schema_version
  '2.0'
end
21
+
22
# Error raised for an invalid PROIEL XML schema version.
#
# It signals that a schema version number was unknown or could not be
# determined from the file.
class InvalidSchemaVersion < RuntimeError
end
27
+
28
# Opens a PROIEL XML file and peeks at the schema version number that the
# file claims to conform to.
#
# The root element must be named `proiel`. A `schema-version` attribute of
# `'2.0'` yields `'2.0'`; a missing attribute is taken to mean `'1.0'`.
#
# @param filename [String] name of the PROIEL XML file to inspect
#
# @return [String] schema version number
#
# @raise [InvalidSchemaVersion] if the root element is missing or is not
#   `proiel`, or if the `schema-version` attribute has any other value
#
def self.check_schema_version_of_xml_file(filename)
  document = Nokogiri::XML(File.read(filename))
  root = document && document.root

  if !root || root.name != 'proiel'
    raise InvalidSchemaVersion, 'top-level XML element not found'
  end

  declared_version = root.attr('schema-version')

  # Files without a version attribute are treated as schema version 1.0.
  return '1.0' if declared_version.nil?
  return '2.0' if declared_version == '2.0'

  raise InvalidSchemaVersion, 'invalid schema version number'
end
51
+
52
# Loads a PROIEL XML schema.
#
# The schema file is located with {proiel_xml_schema_filename} and read
# with `File.read`, so no file handle is left open after the call.
#
# @param schema_version [String] schema version number to load
#
# @return [Nokogiri::XML::Schema] the loaded schema
#
# @raise [ArgumentError] if `schema_version` is rejected by
#   {proiel_xml_schema_filename}
#
def self.load_proiel_xml_schema(schema_version)
  filename = proiel_xml_schema_filename(schema_version)

  Nokogiri::XML::Schema(File.read(filename))
end
63
+
64
# Works out where the XSD file for a given PROIEL XML schema version lives.
#
# The path is built relative to the directory of this source file, as
# `proiel-<version>/proiel-<version>.xsd`. Only the versions `'1.0'` and
# `'2.0'` are accepted.
#
# @param schema_version [String] schema version number
#
# @return [String] filename
#
# @raise [ArgumentError] if the schema version is neither `'1.0'` nor `'2.0'`
#
def self.proiel_xml_schema_filename(schema_version)
  known_version = schema_version == '1.0' || schema_version == '2.0'
  raise ArgumentError, 'invalid schema version' unless known_version

  basename = "proiel-#{schema_version}"
  File.join(File.dirname(__FILE__), basename, "#{basename}.xsd")
end
79
+ end
80
+ end
81
+ end
@@ -0,0 +1,177 @@
1
+ #--
2
+ # Copyright (c) 2015 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
+ module PROIEL
7
+ module PROIELXML
8
+ # A validator object that uses an XML schema as well as additional
9
+ # integrity checks to validate a PROIEL XML file. Functionality for
10
+ # loading the XML schema and checking the PROIEL XML version number is
11
+ # found in {PROIEL::PROIELXML::Schema}.
12
+ class Validator
13
+ # Returns an array of error messages generated during validation.
14
+ attr_reader :errors
15
+
16
# Sets up a validator for one PROIEL XML file. Nothing is read from disk
# here; the file name is only stored, and the list of error messages
# starts out empty.
#
# @param filename [String] name of PROIEL XML file to validate
#
def initialize(filename)
  @filename = filename
  @errors = []
end
24
+
25
# Runs the full validation of the PROIEL XML file: well-formedness, schema
# version, schema validation and referential integrity, in that order.
# Evaluation stops at the first check that fails, so later checks are not
# run.
#
# If invalid, `errors` will contain error messages.
#
# @return [true, false]
#
def valid?
  wellformed? &&
    valid_schema_version? &&
    validates? &&
    has_referential_integrity?
end
36
+
37
# Checks that the PROIEL XML file is well-formed XML by parsing it with
# Nokogiri's strict option enabled.
#
# If the parser raises a syntax error, a message is appended to `errors`.
#
# @return [true, false]
#
def wellformed?
  Nokogiri::XML(File.read(@filename)) { |options| options.strict }

  true
rescue Nokogiri::XML::SyntaxError
  @errors << 'XML file is not wellformed'

  false
end
54
+
55
# Checks that the PROIEL XML file declares a valid schema version number,
# as determined by `Schema.check_schema_version_of_xml_file`.
#
# If that lookup returns nil or raises `InvalidSchemaVersion`, an error
# message is appended to `errors`.
#
# @return [true, false]
#
def valid_schema_version?
  version = PROIEL::PROIELXML::Schema.check_schema_version_of_xml_file(@filename)
  return true unless version.nil?

  @errors << 'invalid schema version'

  false
rescue PROIEL::PROIELXML::Schema::InvalidSchemaVersion => error
  @errors << error.message

  false
end
76
+
77
# Checks that the PROIEL XML file validates against the schema matching
# the schema version it declares.
#
# Each validation error reported by the schema is appended to `errors` as
# `"Line <line>: <message>"`.
#
# @return [true, false]
#
def validates?
  document = Nokogiri::XML(File.read(@filename))

  version = PROIEL::PROIELXML::Schema.check_schema_version_of_xml_file(@filename)
  xsd = PROIEL::PROIELXML::Schema.load_proiel_xml_schema(version)

  violations = xsd.validate(document)
  return true if violations.empty?

  @errors += violations.map { |violation| "Line #{violation.line}: #{violation.message}" }

  false
end
99
+
100
# Checks the referential integrity of the PROIEL XML file.
#
# The file is loaded into a new `PROIEL::Treebank`. Sentence and token IDs
# are then checked for repetition, and the head, slash and antecedent
# references of every token are checked with `check_reference_locality`.
#
# If inconsistencies are found, error messages will be appended to `errors`.
#
# @return [true, false]
#
def has_referential_integrity?
  treebank = PROIEL::Treebank.new
  treebank.load_from_xml(@filename)

  problems = []

  # Pass 1: index every sentence and token ID, reporting any ID seen twice.
  # For each token we remember the sentence, div and source it belongs to.
  seen_sentences = {}
  token_index = {}

  treebank.sources.each do |source|
    source.divs.each do |div|
      div.sentences.each do |sentence|
        problems << "Repeated sentence ID #{sentence.id}" if seen_sentences.key?(sentence.id)
        seen_sentences[sentence.id] = true

        sentence.tokens.each do |token|
          problems << "Repeated token ID #{token.id}" if token_index.key?(token.id)
          token_index[token.id] = { sentence: sentence.id, div: div.id, source: source.id }
        end
      end
    end
  end

  # Pass 2: verify that ID references point to known tokens in the
  # permitted domain.
  treebank.sources.each do |source|
    source.tokens.each do |token|
      # Head and slash references must stay within the sentence.
      check_reference_locality(problems, token, token_index, :head_id, token.head_id,
                               domain: :sentence, allow_nil: true)

      token.slashes.each do |_relation, target_id|
        check_reference_locality(problems, token, token_index, :slash_id, target_id,
                                 domain: :sentence, allow_nil: false)
      end

      # Antecedent references must stay within the source.
      check_reference_locality(problems, token, token_index, :antecedent_id, token.antecedent_id,
                               domain: :source, allow_nil: true)
    end
  end

  # Pass 3: verify that all features are defined
  # TBD

  return true if problems.empty?

  @errors += problems

  false
end
156
+
157
+ private
158
+
159
# Verifies one ID reference held by a token and records any problem found.
#
# A reference is acceptable when it points to a token present in
# `token_ids` whose recorded `domain` entry equals the ID of the
# referencing token's own object for that domain (`token.send(domain).id`).
# A missing reference is acceptable only when `allow_nil` is set.
#
# @param errors [Array<String>] list that error messages are appended to
# @param token [Object] the token holding the reference
# @param token_ids [Hash] token ID => hash of containing object IDs
# @param attribute_name [Symbol] name of the reference, used in messages
# @param attribute_value [Object, nil] the referenced token ID
# @param domain [Symbol] the key in `token_ids` entries and the method on
#   `token` that identify the domain the reference must stay within
# @param allow_nil [true, false] whether a missing reference is acceptable
#
def check_reference_locality(errors, token, token_ids, attribute_name,
                             attribute_value, domain: :sentence, allow_nil: false)
  if !attribute_value
    errors << "Token #{token.id}: #{attribute_name} is null" unless allow_nil
  else
    target = token_ids[attribute_value]

    if target.nil?
      errors << "Token #{token.id}: #{attribute_name} references an unknown token"
    elsif target[domain] != token.send(domain).id
      errors << "Token #{token.id}: #{attribute_name} references a token in a different #{domain}"
    end
  end
end
175
+ end
176
+ end
177
+ end
@@ -0,0 +1,191 @@
1
+ #--
2
+ # Copyright (c) 2015 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
+ module PROIEL
7
+ # A sentence object in a treebank.
8
+ class Sentence < TreebankObject
9
+ extend Memoist
10
+
11
+ # @return [Integer] ID of the sentence
12
+ attr_reader :id
13
+
14
+ # @return [Div] parent div object
15
+ attr_reader :div
16
+
17
+ # @return [Symbol] annotation status of sentence
18
+ attr_reader :status
19
+
20
+ # @return [nil, String] presentation material before sentence
21
+ attr_reader :presentation_before
22
+
23
+ # @return [nil, String] presentation material after sentence
24
+ attr_reader :presentation_after
25
+
26
# Creates a new sentence object.
#
# If a block is given it is called with the new sentence and its return
# value becomes the sentence's list of children (its tokens). Without a
# block the sentence starts out with an empty list of children, so that
# methods iterating over the children work on a childless sentence.
#
# The presentation strings passed in are frozen.
#
# @param parent [Div] parent div object
# @param id [Integer] ID of the sentence
# @param status [String, Symbol] annotation status, stored as a symbol
# @param presentation_before [nil, String] presentation material before the sentence
# @param presentation_after [nil, String] presentation material after the sentence
#
# @yieldparam sentence [Sentence] the sentence being created
# @yieldreturn [Array] the children of the sentence
#
# @raise [ArgumentError] if an argument is of an unexpected type
#
def initialize(parent, id, status, presentation_before, presentation_after, &block)
  @div = parent

  raise ArgumentError, 'integer expected' unless id.is_a?(Integer)
  @id = id

  raise ArgumentError, 'string or symbol expected' unless status.is_a?(String) || status.is_a?(Symbol)
  @status = status.to_sym

  raise ArgumentError, 'string or nil expected' unless presentation_before.nil? || presentation_before.is_a?(String)
  @presentation_before = presentation_before.freeze

  raise ArgumentError, 'string or nil expected' unless presentation_after.nil? || presentation_after.is_a?(String)
  @presentation_after = presentation_after.freeze

  # Default to an empty list so a sentence built without a block never
  # leaves @children as nil.
  @children = block_given? ? block.call(self) : []
end
44
+
45
# Looks up the source that the sentence belongs to via its parent div.
#
# @return [Source] parent source object
def source
  @div.source
end
49
+
50
# Looks up the treebank that the sentence belongs to via its parent div
# and that div's source.
#
# @return [Treebank] parent treebank object
def treebank
  parent_source = @div.source
  parent_source.treebank
end
54
+
55
# The language of the sentence, which is the language of its source.
#
# @return [String] language of the sentence as an ISO 639-3 language tag
def language
  source.language
end
59
+
60
+ memoize :language
61
+
62
# Builds the complete citation by joining the citation component of the
# source and the citation component of the sentence with a single space.
#
# @see citation_part
#
# @return [String] the complete citation for the sentence
def citation
  components = [source.citation_part, citation_part]
  components.join(' ')
end
66
+
67
# Computes an appropriate citation component for the sentence.
#
# Only children that report having a citation are considered. The citation
# components of the first and the last of these are combined into a range
# by `Citations.citation_make_range`; if no child has a citation, both
# ends of the range are nil.
#
# The result must be concatenated with the citation component provided by
# the source to produce a complete citation.
#
# @see citation
#
# @return [String] the citation component
def citation_part
  cited = @children.select(&:has_citation?)
  first_cited = cited.first
  last_cited = cited.last

  range_start = first_cited.citation_part if first_cited
  range_end = last_cited.citation_part if last_cited

  Citations.citation_make_range(range_start, range_end)
end
82
+
83
# Renders the sentence as text: any presentation material before the
# sentence, the printable form of each child in order, and any
# presentation material after the sentence, concatenated without a
# separator.
#
# @param options [Hash] options passed on unchanged to each child's
#   `printable_form`
#
# @return [String] the printable form of the sentence
def printable_form(options = {})
  token_forms = @children.map { |token| token.printable_form(options) }

  [presentation_before, token_forms, presentation_after].compact.join
end
92
+
93
# Tells whether the sentence has been reviewed, i.e. whether its `status`
# is `:reviewed`.
#
# @return [true,false]
def reviewed?
  @status == :reviewed
end
101
+
102
# Tells whether the sentence has been annotated.
#
# Only annotated sentences can be reviewed, so a reviewed sentence also
# counts as annotated: the result is true when `status` is `:reviewed` or
# `:annotated`.
#
# @return [true,false]
def annotated?
  @status == :reviewed || @status == :annotated
end
111
+
112
# Tells whether the sentence is unannotated, i.e. whether its `status` is
# `:unannotated`.
#
# @return [true,false]
def unannotated?
  @status == :unannotated
end
120
+
121
# Builds a syntax graph for the dependency annotation of the sentence with
# a dummy root node on top. The dummy root has a nil ID, a nil relation
# and no slashes; its children are the graphs returned by {syntax_graphs}.
#
# The graph is represented as a hash of hashes. Each hash contains the ID
# of the token, its relation (to its syntactically dominating token), its
# children and a list of secondary edges.
#
# @return [Hash] a single graph with a dummy root node represented as a hash
#
# @example
#
#   sentence.syntax_graph # => { id: nil, relation: nil, children: [{ id: 1000, relation: "pred", children: [{ id: 1001, relation: "xcomp", children: [], slashes: [["xsub", 1000]] }], slashes: [] }], slashes: [] }
#
def syntax_graph
  roots = syntax_graphs

  { id: nil, relation: nil, children: roots, slashes: [] }
end
135
+
136
# Builds syntax graphs for the dependency annotation of the sentence.
# No dummy root node is inserted, so several graphs may be returned: one
# for each token without a head ID.
#
# Each graph is represented as a hash of hashes. Each hash contains the ID
# of the token, its relation (to its syntactically dominating token), its
# children and a list of secondary edges.
#
# @return [Array] zero or more syntax graphs represented as hashes
#
# @example Get the syntax graphs without a dummy root node
#
#   sentence.syntax_graphs # => [{ id: 1000, relation: "pred", children: [{ id: 1001, relation: "xcomp", children: [], slashes: [["xsub", 1000]] }], slashes: [] }]
#
def syntax_graphs
  # Pass 1: build one node hash per token, indexed by token ID.
  nodes = {}

  @children.each do |token|
    nodes[token.id] = {
      id: token.id,
      relation: token.relation,
      children: [],
      slashes: token.slashes,
    }
  end

  # Pass 2: attach each node to its head's children list. Nodes for tokens
  # without a head ID become the roots that are returned.
  roots = []

  @children.each do |token|
    node = nodes[token.id]

    if token.head_id
      nodes[token.head_id][:children] << node
    else
      roots << node
    end
  end

  roots
end
173
+
174
# Gives access to all tokens in the sentence as an enumerator over the
# sentence's children.
#
# @return [Enumerator] tokens in the sentence
#
# @example Iterating tokens
#   tokens.each { |t| puts t.id }
#
# @example Create an array with only empty tokens
#   tokens.select(&:is_empty?)
#
# @example Counting tokens
#   puts tokens.count #=> 200
#
def tokens
  @children.enum_for(:each)
end
190
+ end
191
+ end