proiel 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +19 -0
- data/README.md +99 -0
- data/bin/console +6 -0
- data/bin/setup +5 -0
- data/lib/proiel/annotation_schema.rb +127 -0
- data/lib/proiel/citations.rb +84 -0
- data/lib/proiel/div.rb +133 -0
- data/lib/proiel/positional_tag.rb +127 -0
- data/lib/proiel/proiel_xml/proiel-1.0/proiel-1.0.xsd +172 -0
- data/lib/proiel/proiel_xml/proiel-1.0/teilite.xsd +7387 -0
- data/lib/proiel/proiel_xml/proiel-1.0/xml.xsd +287 -0
- data/lib/proiel/proiel_xml/proiel-2.0/proiel-2.0.xsd +185 -0
- data/lib/proiel/proiel_xml/reader.rb +237 -0
- data/lib/proiel/proiel_xml/schema.rb +81 -0
- data/lib/proiel/proiel_xml/validator.rb +177 -0
- data/lib/proiel/sentence.rb +191 -0
- data/lib/proiel/source.rb +114 -0
- data/lib/proiel/statistics.rb +41 -0
- data/lib/proiel/token.rb +407 -0
- data/lib/proiel/tokenization.rb +90 -0
- data/lib/proiel/treebank.rb +214 -0
- data/lib/proiel/treebank_object.rb +21 -0
- data/lib/proiel/version.rb +9 -0
- data/lib/proiel.rb +28 -0
- metadata +210 -0
@@ -0,0 +1,214 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2015 Marius L. Jøhndal
|
3
|
+
#
|
4
|
+
# See LICENSE in the top-level source directory for licensing terms.
|
5
|
+
#++
|
6
|
+
module PROIEL
|
7
|
+
# Error raised on an annotation schema or schema version mismatch.
#
# Raised when a treebank source is loaded into a {Treebank} object whose
# previously loaded sources were defined with an incompatible schema.
class SchemaMismatch < RuntimeError
end
|
13
|
+
|
14
|
+
# A class representing a PROIEL treebank containing any number of sources.
|
15
|
+
# The sources must use the same annotation scheme.
|
16
|
+
class Treebank
|
17
|
+
# @return [AnnotationSchema] annotation schema for the treebank
|
18
|
+
attr_reader :annotation_schema
|
19
|
+
|
20
|
+
# @return [String] PROIEL XML schema version for the treebank
|
21
|
+
attr_reader :schema_version
|
22
|
+
|
23
|
+
# @return [Array<Source>] sources in the treebank
|
24
|
+
attr_reader :sources
|
25
|
+
|
26
|
+
# Available metadata elements for sources.
#
# Each symbol is sent as a message to a parsed source in order to collect
# that source's metadata (see +bundle_metadata+). The list is frozen so
# that this shared constant cannot be modified by accident at runtime.
METADATA_ELEMENTS = %i[
  title
  author
  citation_part
  principal
  funder
  distributor
  distributor_address
  date
  license
  license_url
  reference_system
  editor
  editorial_note
  annotator
  reviewer
  electronic_text_editor
  electronic_text_title
  electronic_text_version
  electronic_text_publisher
  electronic_text_place
  electronic_text_date
  electronic_text_original_url
  electronic_text_license
  electronic_text_license_url
  printed_text_editor
  printed_text_title
  printed_text_edition
  printed_text_publisher
  printed_text_place
  printed_text_date
].freeze
|
59
|
+
|
60
|
+
# Creates a new, empty treebank object.
#
# No annotation schema or schema version is set yet, the list of sources
# is empty, and each of the four ID lookup tables (sources, divs,
# sentences and tokens) starts out as its own empty hash.
def initialize
  @annotation_schema = @schema_version = nil
  @sources = []

  @source_index, @div_index, @sentence_index, @token_index = {}, {}, {}, {}
end
|
71
|
+
|
72
|
+
# Loads one or more PROIEL XML files.
#
# A filename is opened only for the duration of the load and is closed
# again afterwards; an IO object supplied by the caller is left open. An
# array is loaded element by element.
#
# The annotation schema and schema version of a file are compared with
# those already established for the treebank *before* any of the file's
# sources are added, so a mismatching file does not add any sources.
#
# @param f [String, IO, Array] PROIEL XML files to load
#
# @return [Treebank] treebank object
#
# @raise [SchemaMismatch] if the annotation schema or schema version of a
#   file differs from that of the sources already loaded
# @raise [ArgumentError] if +f+ is not a filename, an IO or an array
#
def load_from_xml(f)
  case f
  when Array
    f.each { |filename| load_from_xml(filename) }
  when String
    # Block form guarantees that the file handle is closed, even when
    # parsing or the schema check raises.
    File.open(f) { |io| load_from_xml(io) }
  when IO
    tf = PROIELXML::Reader.parse_io(f)

    annotation_schema = AnnotationSchema.new(tf.proiel.annotation)
    schema_version = tf.proiel.schema_version

    # The first file loaded establishes the schema for the treebank; later
    # files have to agree with it.
    expected_annotation_schema = @annotation_schema || annotation_schema
    expected_schema_version = @schema_version || schema_version

    # Validate before touching @sources or the indexes so that a rejected
    # file does not leave its sources behind in the treebank.
    unless expected_annotation_schema == annotation_schema &&
           expected_schema_version == schema_version
      raise SchemaMismatch
    end

    tf.proiel.sources.each do |s|
      new_source = Source.new(self, s.id, tf.proiel.export_time, s.language,
                              bundle_metadata(s)) do |source|
        build_divs(s, source)
      end

      @sources << new_source
      index_objects!(new_source)
    end

    @annotation_schema = expected_annotation_schema
    @schema_version = expected_schema_version

    # FIXME: consolidate export times? This is a design flaw in PROIEL XML
    # 2.0: export time ought to be per source not per PROIEL XML file, so
    # not clear what to do here. Pass it down to the source object?
    #@export_time = tf.proiel.export_time
  else
    raise ArgumentError, 'expected filename, IO or array of these'
  end

  self
end
|
116
|
+
|
117
|
+
# Looks up a {Source} object by its source ID.
#
# @param id [String] ID of the source
#
# @return [nil, Source] the source, or +nil+ if the ID is not indexed
#
# @raise [ArgumentError] if +id+ is not a string
def find_source(id)
  return @source_index[id] if id.is_a?(String)

  raise ArgumentError, 'string expected'
end
|
127
|
+
|
128
|
+
# Looks up a {Div} object by its div ID.
#
# @param id [Integer] ID of the div
#
# @return [nil, Div] the div, or +nil+ if the ID is not indexed
#
# @raise [ArgumentError] if +id+ is not an integer
def find_div(id)
  return @div_index[id] if id.is_a?(Integer)

  raise ArgumentError, 'integer expected'
end
|
138
|
+
|
139
|
+
# Looks up a {Sentence} object by its sentence ID.
#
# @param id [Integer] ID of the sentence
#
# @return [nil, Sentence] the sentence, or +nil+ if the ID is not indexed
#
# @raise [ArgumentError] if +id+ is not an integer
def find_sentence(id)
  return @sentence_index[id] if id.is_a?(Integer)

  raise ArgumentError, 'integer expected'
end
|
149
|
+
|
150
|
+
# Looks up a {Token} object by its token ID.
#
# @param id [Integer] ID of the token
#
# @return [nil, Token] the token, or +nil+ if the ID is not indexed
#
# @raise [ArgumentError] if +id+ is not an integer
def find_token(id)
  return @token_index[id] if id.is_a?(Integer)

  raise ArgumentError, 'integer expected'
end
|
160
|
+
|
161
|
+
private
|
162
|
+
|
163
|
+
# Collects the metadata of a parsed source into a hash.
#
# Every element listed in METADATA_ELEMENTS is sent to +s+ as a message;
# the hash maps each element name to the value returned, in the order the
# elements are listed.
def bundle_metadata(s)
  METADATA_ELEMENTS.each_with_object({}) do |element, metadata|
    metadata[element] = s.send(element)
  end
end
|
166
|
+
|
167
|
+
# Creates one Div object for each div of the parsed source +s+, with
# +source+ as the first constructor argument.
#
# Divs are numbered consecutively from 1 in the order they occur. The block
# handed to Div.new builds the sentences of the div.
#
# Returns the array of Div objects created.
def build_divs(s, source)
  # FIXME: for PROIEL XML > 2.0, we should respect d.id
  s.divs.map.with_index(1) do |d, position|
    Div.new(source, position, d.title, d.presentation_before,
            d.presentation_after) { |div| build_sentences(d, div) }
  end
end
|
176
|
+
|
177
|
+
# Creates one Sentence object for each sentence of the parsed div +d+,
# with +div+ as the first constructor argument.
#
# The block handed to Sentence.new builds the tokens of the sentence.
#
# Returns the array of Sentence objects created.
def build_sentences(d, div)
  d.sentences.map do |e|
    Sentence.new(div, e.id, e.status, e.presentation_before,
                 e.presentation_after) { |sentence| build_tokens(e, sentence) }
  end
end
|
185
|
+
|
186
|
+
# Creates one Token object for each token of the parsed sentence +e+, with
# +sentence+ as the first constructor argument.
#
# The remaining constructor arguments are read from the parsed token in
# the order listed below.
#
# Returns the array of Token objects created.
def build_tokens(e, sentence)
  e.tokens.map do |t|
    attributes = [
      t.id, t.head_id, t.form, t.lemma,
      t.part_of_speech, t.morphology, t.relation,
      t.empty_token_sort, t.citation_part,
      t.presentation_before, t.presentation_after,
      t.antecedent_id, t.information_status,
      t.contrast_group, t.foreign_ids,
      t.slashes
    ]

    Token.new(sentence, *attributes)
  end
end
|
197
|
+
|
198
|
+
# Registers +source+ and everything it contains in the ID lookup tables.
#
# The source itself, each of its divs, each sentence of those divs and
# each token of those sentences is stored under its own ID. An entry
# already stored under the same ID is replaced.
def index_objects!(source)
  @source_index[source.id] = source

  source.divs.each do |d|
    @div_index[d.id] = d

    d.sentences.each do |s|
      @sentence_index[s.id] = s

      s.tokens.each { |t| @token_index[t.id] = t }
    end
  end
end
|
213
|
+
end
|
214
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2015 Marius L. Jøhndal
|
3
|
+
#
|
4
|
+
# See LICENSE in the top-level source directory for licensing terms.
|
5
|
+
#++
|
6
|
+
module PROIEL
|
7
|
+
# Base class for objects in a treebank.
#
# @abstract
class TreebankObject
  # Builds a short, human-readable description of the object.
  #
  # Only the class and the inspected ID of the object are included, which
  # keeps the output small and prevents (potentially infinite) recursion
  # into the object tree. The +id+ method is not defined by this class.
  #
  # @return [String]
  def inspect
    format('#<%s @id=%s>', self.class, id.inspect)
  end
end
|
21
|
+
end
|
data/lib/proiel.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2015 Marius L. Jøhndal
|
3
|
+
#
|
4
|
+
# See LICENSE in the top-level source directory for licensing terms.
|
5
|
+
#++
|
6
|
+
require 'date'
|
7
|
+
require 'json'
|
8
|
+
require 'zlib'
|
9
|
+
require 'ostruct'
|
10
|
+
require 'sax-machine'
|
11
|
+
require 'memoist'
|
12
|
+
require 'nokogiri'
|
13
|
+
|
14
|
+
require 'proiel/version'
|
15
|
+
require 'proiel/citations'
|
16
|
+
require 'proiel/statistics'
|
17
|
+
require 'proiel/tokenization'
|
18
|
+
require 'proiel/positional_tag'
|
19
|
+
require 'proiel/proiel_xml/reader'
|
20
|
+
require 'proiel/proiel_xml/validator'
|
21
|
+
require 'proiel/proiel_xml/schema'
|
22
|
+
require 'proiel/treebank'
|
23
|
+
require 'proiel/annotation_schema'
|
24
|
+
require 'proiel/treebank_object'
|
25
|
+
require 'proiel/source'
|
26
|
+
require 'proiel/div'
|
27
|
+
require 'proiel/sentence'
|
28
|
+
require 'proiel/token'
|
metadata
ADDED
@@ -0,0 +1,210 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: proiel
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Marius L. Jøhndal
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-10-28 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: json
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.8'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.8'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: nokogiri
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.6.6
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.6.6
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: sax-machine
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 1.3.2
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 1.3.2
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: memoist
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0.12'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0.12'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: bundler
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '1.10'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '1.10'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rake
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '10.0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '10.0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rspec
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '3.2'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '3.2'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: pry
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0.10'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0.10'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: simplecov
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - "~>"
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0.10'
|
132
|
+
type: :development
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - "~>"
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0.10'
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: yard
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - "~>"
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: 0.8.7
|
146
|
+
type: :development
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - "~>"
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: 0.8.7
|
153
|
+
description: This provides a library of functions for reading and manipulating treebanks
|
154
|
+
using the PROIEL dependency format.
|
155
|
+
email:
|
156
|
+
- mariuslj@ifi.uio.no
|
157
|
+
executables: []
|
158
|
+
extensions: []
|
159
|
+
extra_rdoc_files: []
|
160
|
+
files:
|
161
|
+
- LICENSE
|
162
|
+
- README.md
|
163
|
+
- bin/console
|
164
|
+
- bin/setup
|
165
|
+
- lib/proiel.rb
|
166
|
+
- lib/proiel/annotation_schema.rb
|
167
|
+
- lib/proiel/citations.rb
|
168
|
+
- lib/proiel/div.rb
|
169
|
+
- lib/proiel/positional_tag.rb
|
170
|
+
- lib/proiel/proiel_xml/proiel-1.0/proiel-1.0.xsd
|
171
|
+
- lib/proiel/proiel_xml/proiel-1.0/teilite.xsd
|
172
|
+
- lib/proiel/proiel_xml/proiel-1.0/xml.xsd
|
173
|
+
- lib/proiel/proiel_xml/proiel-2.0/proiel-2.0.xsd
|
174
|
+
- lib/proiel/proiel_xml/reader.rb
|
175
|
+
- lib/proiel/proiel_xml/schema.rb
|
176
|
+
- lib/proiel/proiel_xml/validator.rb
|
177
|
+
- lib/proiel/sentence.rb
|
178
|
+
- lib/proiel/source.rb
|
179
|
+
- lib/proiel/statistics.rb
|
180
|
+
- lib/proiel/token.rb
|
181
|
+
- lib/proiel/tokenization.rb
|
182
|
+
- lib/proiel/treebank.rb
|
183
|
+
- lib/proiel/treebank_object.rb
|
184
|
+
- lib/proiel/version.rb
|
185
|
+
homepage: http://proiel.github.com
|
186
|
+
licenses:
|
187
|
+
- MIT
|
188
|
+
metadata: {}
|
189
|
+
post_install_message:
|
190
|
+
rdoc_options: []
|
191
|
+
require_paths:
|
192
|
+
- lib
|
193
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
194
|
+
requirements:
|
195
|
+
- - ">="
|
196
|
+
- !ruby/object:Gem::Version
|
197
|
+
version: '2.1'
|
198
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
199
|
+
requirements:
|
200
|
+
- - ">="
|
201
|
+
- !ruby/object:Gem::Version
|
202
|
+
version: '0'
|
203
|
+
requirements: []
|
204
|
+
rubyforge_project:
|
205
|
+
rubygems_version: 2.4.5.1
|
206
|
+
signing_key:
|
207
|
+
specification_version: 4
|
208
|
+
summary: A library for working with treebanks using the PROIEL dependency format
|
209
|
+
test_files: []
|
210
|
+
has_rdoc:
|