proiel 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,214 @@
1
+ #--
2
+ # Copyright (c) 2015 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
module PROIEL
  # Schema mismatch error.
  #
  # Raised when a treebank source is loaded into a {Treebank} object that
  # already contains sources defined with an incompatible schema.
  class SchemaMismatch < RuntimeError; end

  # A class representing a PROIEL treebank containing any number of sources.
  # The sources must use the same annotation scheme.
  class Treebank
    # @return [AnnotationSchema] annotation schema for the treebank
    attr_reader :annotation_schema

    # @return [String] PROIEL XML schema version for the treebank
    attr_reader :schema_version

    # @return [Array<Source>] sources in the treebank
    attr_reader :sources

    # Available metadata elements for sources. Frozen so that the list cannot
    # be modified by accident at runtime.
    METADATA_ELEMENTS = %i(
      title
      author
      citation_part
      principal
      funder
      distributor
      distributor_address
      date
      license
      license_url
      reference_system
      editor
      editorial_note
      annotator
      reviewer
      electronic_text_editor
      electronic_text_title
      electronic_text_version
      electronic_text_publisher
      electronic_text_place
      electronic_text_date
      electronic_text_original_url
      electronic_text_license
      electronic_text_license_url
      printed_text_editor
      printed_text_title
      printed_text_edition
      printed_text_publisher
      printed_text_place
      printed_text_date
    ).freeze

    # Creates a new, empty treebank object.
    def initialize
      @annotation_schema = nil
      @schema_version = nil
      @sources = []

      # Lookup tables from object ID to object, filled in as sources are loaded.
      @source_index = {}
      @div_index = {}
      @sentence_index = {}
      @token_index = {}
    end

    # Loads one or more PROIEL XML files.
    #
    # @param f [String, IO, Array] PROIEL XML files to load; a string is
    #   treated as a filename, an array is loaded element by element
    #
    # @return [Treebank] treebank object
    #
    # @raise [ArgumentError] if `f` is not a filename, an IO or an array
    # @raise [SchemaMismatch] if the file uses an annotation schema or schema
    #   version that differs from that of the sources already loaded
    def load_from_xml(f)
      case f
      when Array
        f.each { |filename| load_from_xml(filename) }
      when String
        # The block form closes the file handle even if parsing raises.
        File.open(f) { |io| load_from_xml(io) }
      when IO
        load_from_io(f)
      else
        raise ArgumentError, 'expected filename, IO or array of these'
      end

      self
    end

    # Finds the {Source} object corresponding to a source ID.
    #
    # @param id [String]
    #
    # @return [nil, Source]
    #
    # @raise [ArgumentError] if `id` is not a string
    def find_source(id)
      raise ArgumentError, 'string expected' unless id.is_a?(String)

      @source_index[id]
    end

    # Finds the {Div} object corresponding to a div ID.
    #
    # @param id [Integer]
    #
    # @return [nil, Div]
    #
    # @raise [ArgumentError] if `id` is not an integer
    def find_div(id)
      raise ArgumentError, 'integer expected' unless id.is_a?(Integer)

      @div_index[id]
    end

    # Finds the {Sentence} object corresponding to a sentence ID.
    #
    # @param id [Integer]
    #
    # @return [nil, Sentence]
    #
    # @raise [ArgumentError] if `id` is not an integer
    def find_sentence(id)
      raise ArgumentError, 'integer expected' unless id.is_a?(Integer)

      @sentence_index[id]
    end

    # Finds the {Token} object corresponding to a token ID.
    #
    # @param id [Integer]
    #
    # @return [nil, Token]
    #
    # @raise [ArgumentError] if `id` is not an integer
    def find_token(id)
      raise ArgumentError, 'integer expected' unless id.is_a?(Integer)

      @token_index[id]
    end

    private

    # Parses a single PROIEL XML stream and adds its sources to the treebank.
    #
    # The schema of the parsed file is checked before any state is changed, so
    # a rejected file leaves the treebank exactly as it was.
    def load_from_io(io)
      tf = PROIELXML::Reader.parse_io(io)

      annotation_schema = AnnotationSchema.new(tf.proiel.annotation)
      schema_version = tf.proiel.schema_version

      raise SchemaMismatch unless compatible_schema?(annotation_schema, schema_version)

      # The first file loaded determines the schema for the whole treebank.
      @annotation_schema ||= annotation_schema
      @schema_version ||= schema_version

      # FIXME: consolidate export times? This is a design flaw in PROIEL XML
      # 2.0: export time ought to be per source not per PROIEL XML file, so
      # for now it is passed down to each source object.
      tf.proiel.sources.each do |s|
        source = Source.new(self, s.id, tf.proiel.export_time, s.language,
                            bundle_metadata(s)) do |new_source|
          build_divs(s, new_source)
        end

        @sources << source
        index_objects!(source)
      end
    end

    # Tests whether a schema/version pair agrees with what is already loaded.
    # An attribute that has not been set yet is compatible with anything.
    def compatible_schema?(annotation_schema, schema_version)
      (@annotation_schema.nil? || @annotation_schema == annotation_schema) &&
        (@schema_version.nil? || @schema_version == schema_version)
    end

    # Collects the metadata elements of a parsed source into a hash keyed by
    # element name.
    def bundle_metadata(s)
      METADATA_ELEMENTS.map { |f| [f, s.send(f)] }.to_h
    end

    # Builds the {Div} objects of a parsed source, numbering them from 1 in
    # document order.
    def build_divs(s, source)
      # FIXME: for PROIEL XML > 2.0, we should respect d.id
      s.divs.each_with_index.map do |d, i|
        Div.new(source, i + 1, d.title, d.presentation_before,
                d.presentation_after) do |div|
          build_sentences(d, div)
        end
      end
    end

    # Builds the {Sentence} objects of a parsed div.
    def build_sentences(d, div)
      d.sentences.map do |e|
        Sentence.new(div, e.id, e.status, e.presentation_before,
                     e.presentation_after) do |sentence|
          build_tokens(e, sentence)
        end
      end
    end

    # Builds the {Token} objects of a parsed sentence.
    def build_tokens(e, sentence)
      e.tokens.map do |t|
        Token.new(sentence, t.id, t.head_id, t.form, t.lemma,
                  t.part_of_speech, t.morphology, t.relation,
                  t.empty_token_sort, t.citation_part,
                  t.presentation_before, t.presentation_after,
                  t.antecedent_id, t.information_status,
                  t.contrast_group, t.foreign_ids,
                  t.slashes)
      end
    end

    # Registers a source and all the divs, sentences and tokens below it in
    # the ID lookup tables used by the find_* methods.
    def index_objects!(source)
      @source_index[source.id] = source

      source.divs.each do |div|
        @div_index[div.id] = div

        div.sentences.each do |sentence|
          @sentence_index[sentence.id] = sentence

          sentence.tokens.each do |token|
            @token_index[token.id] = token
          end
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ #--
2
+ # Copyright (c) 2015 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
module PROIEL
  # An object in a treebank.
  #
  # Subclasses are expected to expose a public `id` method, which {#inspect}
  # uses to identify the object.
  #
  # @abstract
  class TreebankObject
    # Returns a string containing a human-readable representation of the object.
    #
    # This implementation provides only minimal information about the object
    # and prevents (potentially infinite) recursion into the object tree.
    #
    # If the object does not respond to `id` (for example an instance of this
    # abstract class itself), only the class name is shown, so that `inspect`
    # never raises when called from a debugger or a REPL.
    #
    # @return [String]
    def inspect
      return "#<#{self.class}>" unless respond_to?(:id)

      "#<#{self.class} @id=#{id.inspect}>"
    end
  end
end
@@ -0,0 +1,9 @@
1
+ #--
2
+ # Copyright (c) 2015 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
module PROIEL
  # Gem version. Frozen so that the shared constant string cannot be modified
  # in place by code that reads it.
  VERSION = '1.0.0'.freeze
end
data/lib/proiel.rb ADDED
@@ -0,0 +1,28 @@
1
+ #--
2
+ # Copyright (c) 2015 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
+ require 'date'
7
+ require 'json'
8
+ require 'zlib'
9
+ require 'ostruct'
10
+ require 'sax-machine'
11
+ require 'memoist'
12
+ require 'nokogiri'
13
+
14
+ require 'proiel/version'
15
+ require 'proiel/citations'
16
+ require 'proiel/statistics'
17
+ require 'proiel/tokenization'
18
+ require 'proiel/positional_tag'
19
+ require 'proiel/proiel_xml/reader'
20
+ require 'proiel/proiel_xml/validator'
21
+ require 'proiel/proiel_xml/schema'
22
+ require 'proiel/treebank'
23
+ require 'proiel/annotation_schema'
24
+ require 'proiel/treebank_object'
25
+ require 'proiel/source'
26
+ require 'proiel/div'
27
+ require 'proiel/sentence'
28
+ require 'proiel/token'
metadata ADDED
@@ -0,0 +1,210 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: proiel
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Marius L. Jøhndal
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-10-28 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: json
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.8'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.8'
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 1.6.6
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 1.6.6
41
+ - !ruby/object:Gem::Dependency
42
+ name: sax-machine
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 1.3.2
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 1.3.2
55
+ - !ruby/object:Gem::Dependency
56
+ name: memoist
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '0.12'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '0.12'
69
+ - !ruby/object:Gem::Dependency
70
+ name: bundler
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.10'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.10'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rake
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '10.0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '10.0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rspec
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '3.2'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '3.2'
111
+ - !ruby/object:Gem::Dependency
112
+ name: pry
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '0.10'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '0.10'
125
+ - !ruby/object:Gem::Dependency
126
+ name: simplecov
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '0.10'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: '0.10'
139
+ - !ruby/object:Gem::Dependency
140
+ name: yard
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: 0.8.7
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: 0.8.7
153
+ description: This provides a library of functions for reading and manipulating treebanks
154
+ using the PROIEL dependency format.
155
+ email:
156
+ - mariuslj@ifi.uio.no
157
+ executables: []
158
+ extensions: []
159
+ extra_rdoc_files: []
160
+ files:
161
+ - LICENSE
162
+ - README.md
163
+ - bin/console
164
+ - bin/setup
165
+ - lib/proiel.rb
166
+ - lib/proiel/annotation_schema.rb
167
+ - lib/proiel/citations.rb
168
+ - lib/proiel/div.rb
169
+ - lib/proiel/positional_tag.rb
170
+ - lib/proiel/proiel_xml/proiel-1.0/proiel-1.0.xsd
171
+ - lib/proiel/proiel_xml/proiel-1.0/teilite.xsd
172
+ - lib/proiel/proiel_xml/proiel-1.0/xml.xsd
173
+ - lib/proiel/proiel_xml/proiel-2.0/proiel-2.0.xsd
174
+ - lib/proiel/proiel_xml/reader.rb
175
+ - lib/proiel/proiel_xml/schema.rb
176
+ - lib/proiel/proiel_xml/validator.rb
177
+ - lib/proiel/sentence.rb
178
+ - lib/proiel/source.rb
179
+ - lib/proiel/statistics.rb
180
+ - lib/proiel/token.rb
181
+ - lib/proiel/tokenization.rb
182
+ - lib/proiel/treebank.rb
183
+ - lib/proiel/treebank_object.rb
184
+ - lib/proiel/version.rb
185
+ homepage: http://proiel.github.com
186
+ licenses:
187
+ - MIT
188
+ metadata: {}
189
+ post_install_message:
190
+ rdoc_options: []
191
+ require_paths:
192
+ - lib
193
+ required_ruby_version: !ruby/object:Gem::Requirement
194
+ requirements:
195
+ - - ">="
196
+ - !ruby/object:Gem::Version
197
+ version: '2.1'
198
+ required_rubygems_version: !ruby/object:Gem::Requirement
199
+ requirements:
200
+ - - ">="
201
+ - !ruby/object:Gem::Version
202
+ version: '0'
203
+ requirements: []
204
+ rubyforge_project:
205
+ rubygems_version: 2.4.5.1
206
+ signing_key:
207
+ specification_version: 4
208
+ summary: A library for working with treebanks using the PROIEL dependency format
209
+ test_files: []
210
+ has_rdoc: