proiel 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,214 @@
1
+ #--
2
+ # Copyright (c) 2015 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
+ module PROIEL
7
+ # Schema mismatch error.
8
+ #
9
+ # This represents an error that occurs when a treebank source is loaded
10
+ # into a {Treebank} object that already contains sources defined with an
11
+ # incompatible schema.
12
# Error class raised to signal a schema mismatch. It adds no behaviour of
# its own beyond what it inherits from RuntimeError.
class SchemaMismatch < RuntimeError
end
13
+
14
+ # A class representing a PROIEL treebank containing any number of sources.
15
+ # The sources must use the same annotation scheme.
16
class Treebank
  # @return [AnnotationSchema] annotation schema for the treebank
  attr_reader :annotation_schema

  # @return [String] PROIEL XML schema version for the treebank
  attr_reader :schema_version

  # @return [Array<Source>] sources in the treebank
  attr_reader :sources

  # Available metadata elements for sources. Each symbol is sent to the
  # parsed source object to collect its metadata (see #bundle_metadata).
  # Frozen so that the list cannot be modified by accident at run time.
  METADATA_ELEMENTS = %i(
    title
    author
    citation_part
    principal
    funder
    distributor
    distributor_address
    date
    license
    license_url
    reference_system
    editor
    editorial_note
    annotator
    reviewer
    electronic_text_editor
    electronic_text_title
    electronic_text_version
    electronic_text_publisher
    electronic_text_place
    electronic_text_date
    electronic_text_original_url
    electronic_text_license
    electronic_text_license_url
    printed_text_editor
    printed_text_title
    printed_text_edition
    printed_text_publisher
    printed_text_place
    printed_text_date
  ).freeze

  # Creates a new treebank object with no sources, no annotation schema and
  # no schema version. These are filled in by {#load_from_xml}.
  def initialize
    @annotation_schema = nil
    @schema_version = nil
    @sources = []

    # Lookup tables from object ID to object, maintained by #index_objects!.
    @source_index = {}
    @div_index = {}
    @sentence_index = {}
    @token_index = {}
  end

  # Loads one or more PROIEL XML files.
  #
  # The first file loaded determines the annotation schema and the schema
  # version of the treebank. Every later file must match both; a file that
  # does not is rejected before any of its sources are added. When an array
  # is given, the elements are loaded in order, so elements preceding a
  # failing one remain loaded.
  #
  # @param f [String, IO, Array] PROIEL XML files to load: a filename, an
  #   open IO object, or an array of these
  #
  # @return [Treebank] treebank object (the receiver)
  #
  # @raise [ArgumentError] if `f` is not a String, an IO or an Array
  # @raise [SchemaMismatch] if a file uses an annotation schema or schema
  #   version different from that of the sources already loaded
  #
  def load_from_xml(f)
    case f
    when Array
      f.each { |filename| load_from_xml(filename) }
    when String
      # The block form closes the file handle once loading has finished,
      # also when loading raises.
      File.open(f) { |io| load_from_xml(io) }
    when IO
      load_from_io(f)
    else
      raise ArgumentError, 'expected filename, IO or array of these'
    end

    self
  end

  # Finds the {Source} object corresponding to a source ID.
  #
  # @param id [String]
  #
  # @return [nil, Source]
  #
  # @raise [ArgumentError] if `id` is not a String
  def find_source(id)
    raise ArgumentError, 'string expected' unless id.is_a?(String)

    @source_index[id]
  end

  # Finds the {Div} object corresponding to a div ID.
  #
  # @param id [Integer]
  #
  # @return [nil, Div]
  #
  # @raise [ArgumentError] if `id` is not an Integer
  def find_div(id)
    raise ArgumentError, 'integer expected' unless id.is_a?(Integer)

    @div_index[id]
  end

  # Finds the {Sentence} object corresponding to a sentence ID.
  #
  # @param id [Integer]
  #
  # @return [nil, Sentence]
  #
  # @raise [ArgumentError] if `id` is not an Integer
  def find_sentence(id)
    raise ArgumentError, 'integer expected' unless id.is_a?(Integer)

    @sentence_index[id]
  end

  # Finds the {Token} object corresponding to a token ID.
  #
  # @param id [Integer]
  #
  # @return [nil, Token]
  #
  # @raise [ArgumentError] if `id` is not an Integer
  def find_token(id)
    raise ArgumentError, 'integer expected' unless id.is_a?(Integer)

    @token_index[id]
  end

  private

  # Parses one PROIEL XML document from an IO object and adds its sources
  # to the treebank.
  #
  # The schema check is done before any source is added or indexed so that
  # a rejected file leaves the treebank untouched.
  def load_from_io(io)
    tf = PROIELXML::Reader.parse_io(io)

    annotation_schema = AnnotationSchema.new(tf.proiel.annotation)
    schema_version = tf.proiel.schema_version

    unless compatible_schema?(annotation_schema, schema_version)
      raise SchemaMismatch,
            'annotation schema or schema version differs from sources already loaded'
    end

    # The first file loaded establishes the schema for the whole treebank.
    @annotation_schema ||= annotation_schema
    @schema_version ||= schema_version

    # FIXME: consolidate export times? This is a design flaw in PROIEL XML
    # 2.0: export time ought to be per source not per PROIEL XML file, so
    # not clear what to do here. For now the export time of the file is
    # passed down to each source object.
    tf.proiel.sources.each do |s|
      source = Source.new(self, s.id, tf.proiel.export_time, s.language,
                          bundle_metadata(s)) do |new_source|
        build_divs(s, new_source)
      end

      @sources << source
      index_objects!(source)
    end
  end

  # Tests whether a schema and schema version agree with those already
  # established for the treebank. Anything is compatible with a treebank
  # that has not had its schema set yet.
  def compatible_schema?(annotation_schema, schema_version)
    (@annotation_schema.nil? || @annotation_schema == annotation_schema) &&
      (@schema_version.nil? || @schema_version == schema_version)
  end

  # Collects the metadata of a parsed source into a hash keyed by the
  # symbols in METADATA_ELEMENTS.
  def bundle_metadata(s)
    METADATA_ELEMENTS.map { |f| [f, s.send(f)] }.to_h
  end

  # Builds a Div object for each div of the parsed source `s`. Returns the
  # new Div objects as an array.
  def build_divs(s, source)
    # FIXME: for PROIEL XML > 2.0, we should respect d.id
    # Divs are numbered from 1 within each source.
    # NOTE(review): assuming Div#id returns this number, divs from different
    # sources share IDs and overwrite each other in the div index - confirm.
    s.divs.each_with_index.map do |d, i|
      Div.new(source, i + 1, d.title, d.presentation_before,
              d.presentation_after) do |div|
        build_sentences(d, div)
      end
    end
  end

  # Builds a Sentence object for each sentence of the parsed div `d`.
  # Returns the new Sentence objects as an array.
  def build_sentences(d, div)
    d.sentences.map do |e|
      Sentence.new(div, e.id, e.status, e.presentation_before,
                   e.presentation_after) do |sentence|
        build_tokens(e, sentence)
      end
    end
  end

  # Builds a Token object for each token of the parsed sentence `e`.
  # Returns the new Token objects as an array.
  def build_tokens(e, sentence)
    e.tokens.map do |t|
      Token.new(sentence, t.id, t.head_id, t.form, t.lemma,
                t.part_of_speech, t.morphology, t.relation,
                t.empty_token_sort, t.citation_part,
                t.presentation_before, t.presentation_after,
                t.antecedent_id, t.information_status,
                t.contrast_group, t.foreign_ids,
                t.slashes)
    end
  end

  # Registers a source and all divs, sentences and tokens below it in the
  # lookup tables used by the find_* methods. An existing entry with the
  # same ID is overwritten.
  def index_objects!(source)
    @source_index[source.id] = source

    source.divs.each do |div|
      @div_index[div.id] = div

      div.sentences.each do |sentence|
        @sentence_index[sentence.id] = sentence

        sentence.tokens.each do |token|
          @token_index[token.id] = token
        end
      end
    end
  end
end
214
+ end
@@ -0,0 +1,21 @@
1
+ #--
2
+ # Copyright (c) 2015 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
+ module PROIEL
7
+ # An object in a treebank.
8
+ #
9
+ # @abstract
10
class TreebankObject
  # Builds a short, human-readable description of the object.
  #
  # Only the class name and the inspected result of calling `id` are
  # included. This keeps the output compact and avoids descending
  # (potentially without end) into the object tree. This class does not
  # define `id` itself, so it has to be supplied elsewhere, e.g. by a
  # subclass.
  #
  # @return [String]
  def inspect
    format('#<%s @id=%s>', self.class, id.inspect)
  end
end
21
+ end
@@ -0,0 +1,9 @@
1
+ #--
2
+ # Copyright (c) 2015 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
module PROIEL
  # Gem version. The string is frozen so that the constant cannot be
  # modified in place at run time.
  VERSION = '1.0.0'.freeze
end
data/lib/proiel.rb ADDED
@@ -0,0 +1,28 @@
1
+ #--
2
+ # Copyright (c) 2015 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
+ require 'date'
7
+ require 'json'
8
+ require 'zlib'
9
+ require 'ostruct'
10
+ require 'sax-machine'
11
+ require 'memoist'
12
+ require 'nokogiri'
13
+
14
+ require 'proiel/version'
15
+ require 'proiel/citations'
16
+ require 'proiel/statistics'
17
+ require 'proiel/tokenization'
18
+ require 'proiel/positional_tag'
19
+ require 'proiel/proiel_xml/reader'
20
+ require 'proiel/proiel_xml/validator'
21
+ require 'proiel/proiel_xml/schema'
22
+ require 'proiel/treebank'
23
+ require 'proiel/annotation_schema'
24
+ require 'proiel/treebank_object'
25
+ require 'proiel/source'
26
+ require 'proiel/div'
27
+ require 'proiel/sentence'
28
+ require 'proiel/token'
metadata ADDED
@@ -0,0 +1,210 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: proiel
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Marius L. Jøhndal
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-10-28 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: json
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.8'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.8'
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 1.6.6
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 1.6.6
41
+ - !ruby/object:Gem::Dependency
42
+ name: sax-machine
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 1.3.2
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 1.3.2
55
+ - !ruby/object:Gem::Dependency
56
+ name: memoist
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '0.12'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '0.12'
69
+ - !ruby/object:Gem::Dependency
70
+ name: bundler
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.10'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.10'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rake
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '10.0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '10.0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rspec
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '3.2'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '3.2'
111
+ - !ruby/object:Gem::Dependency
112
+ name: pry
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '0.10'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '0.10'
125
+ - !ruby/object:Gem::Dependency
126
+ name: simplecov
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '0.10'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: '0.10'
139
+ - !ruby/object:Gem::Dependency
140
+ name: yard
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: 0.8.7
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: 0.8.7
153
+ description: This provides a library of functions for reading and manipulating treebanks
154
+ using the PROIEL dependency format.
155
+ email:
156
+ - mariuslj@ifi.uio.no
157
+ executables: []
158
+ extensions: []
159
+ extra_rdoc_files: []
160
+ files:
161
+ - LICENSE
162
+ - README.md
163
+ - bin/console
164
+ - bin/setup
165
+ - lib/proiel.rb
166
+ - lib/proiel/annotation_schema.rb
167
+ - lib/proiel/citations.rb
168
+ - lib/proiel/div.rb
169
+ - lib/proiel/positional_tag.rb
170
+ - lib/proiel/proiel_xml/proiel-1.0/proiel-1.0.xsd
171
+ - lib/proiel/proiel_xml/proiel-1.0/teilite.xsd
172
+ - lib/proiel/proiel_xml/proiel-1.0/xml.xsd
173
+ - lib/proiel/proiel_xml/proiel-2.0/proiel-2.0.xsd
174
+ - lib/proiel/proiel_xml/reader.rb
175
+ - lib/proiel/proiel_xml/schema.rb
176
+ - lib/proiel/proiel_xml/validator.rb
177
+ - lib/proiel/sentence.rb
178
+ - lib/proiel/source.rb
179
+ - lib/proiel/statistics.rb
180
+ - lib/proiel/token.rb
181
+ - lib/proiel/tokenization.rb
182
+ - lib/proiel/treebank.rb
183
+ - lib/proiel/treebank_object.rb
184
+ - lib/proiel/version.rb
185
+ homepage: http://proiel.github.com
186
+ licenses:
187
+ - MIT
188
+ metadata: {}
189
+ post_install_message:
190
+ rdoc_options: []
191
+ require_paths:
192
+ - lib
193
+ required_ruby_version: !ruby/object:Gem::Requirement
194
+ requirements:
195
+ - - ">="
196
+ - !ruby/object:Gem::Version
197
+ version: '2.1'
198
+ required_rubygems_version: !ruby/object:Gem::Requirement
199
+ requirements:
200
+ - - ">="
201
+ - !ruby/object:Gem::Version
202
+ version: '0'
203
+ requirements: []
204
+ rubyforge_project:
205
+ rubygems_version: 2.4.5.1
206
+ signing_key:
207
+ specification_version: 4
208
+ summary: A library for working with treebanks using the PROIEL dependency format
209
+ test_files: []
210
+ has_rdoc: