proiel 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/proiel/div.rb +8 -2
- data/lib/proiel/proiel_xml/proiel-2.1/proiel-2.1.xsd +198 -0
- data/lib/proiel/proiel_xml/reader.rb +10 -2
- data/lib/proiel/proiel_xml/schema.rb +4 -2
- data/lib/proiel/proiel_xml/validator.rb +10 -0
- data/lib/proiel/sentence.rb +32 -2
- data/lib/proiel/source.rb +10 -3
- data/lib/proiel/token.rb +9 -2
- data/lib/proiel/treebank.rb +9 -6
- data/lib/proiel/utils.rb +16 -0
- data/lib/proiel/version.rb +1 -1
- data/lib/proiel.rb +2 -1
- metadata +4 -7
- data/lib/proiel/proiel_xml/proiel-1.0/proiel-1.0.md +0 -16
- data/lib/proiel/proiel_xml/proiel-2.0/XMLSchema.xsd +0 -2492
- data/lib/proiel/proiel_xml/proiel-2.0/make +0 -1
- data/lib/proiel/proiel_xml/proiel-2.0/proiel-2.0.html +0 -1928
- data/lib/proiel/proiel_xml/proiel-2.0/xs3p.xsl +0 -8520
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 39313c422eb3b2d2f3ad565c0cde3cbd3ddb5271
|
4
|
+
data.tar.gz: 1edadad95bbaad82d4d7ab1a9cc409f8e80d3a74
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 41bf1b5bcb3c8d8318128ea146b2609d02942d711553876d71c29cafc948312e79e8cd2e448fef751ca25c685c3f0d57a924004a46bdb3496a8f9913772e3e48
|
7
|
+
data.tar.gz: add1511098c62bdd4ee59fdd53e55b4b331595a5a5e02320e97dadff194e8b0b96fb24bd48511a02933d70a83208e0f5d40093a49eec959917177bf59589cbb7
|
data/lib/proiel/div.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2015 Marius L. Jøhndal
|
2
|
+
# Copyright (c) 2015-2016 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
@@ -28,8 +28,11 @@ module PROIEL
|
|
28
28
|
# @return [nil, String] presentation material after form
|
29
29
|
attr_reader :presentation_after
|
30
30
|
|
31
|
+
# @return [nil, Integer] ID of the div that this div is aligned to
|
32
|
+
attr_reader :alignment_id
|
33
|
+
|
31
34
|
# Creates a new div object.
|
32
|
-
def initialize(parent, id, title, presentation_before, presentation_after, &block)
|
35
|
+
def initialize(parent, id, title, presentation_before, presentation_after, alignment_id, &block)
|
33
36
|
@source = parent
|
34
37
|
|
35
38
|
raise ArgumentError, 'integer expected' unless id.is_a?(Integer)
|
@@ -44,6 +47,9 @@ module PROIEL
|
|
44
47
|
raise ArgumentError, 'string or nil expected' unless presentation_after.nil? or presentation_after.is_a?(String)
|
45
48
|
@presentation_after = presentation_after.freeze
|
46
49
|
|
50
|
+
raise ArgumentError, 'integer or nil expected' unless alignment_id.nil? or alignment_id.is_a?(Integer)
|
51
|
+
@alignment_id = alignment_id
|
52
|
+
|
47
53
|
@children = block.call(self) if block_given?
|
48
54
|
end
|
49
55
|
|
@@ -0,0 +1,198 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
|
3
|
+
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
|
4
|
+
<xs:annotation>
|
5
|
+
<xs:documentation>PROIEL XML format version 2.1</xs:documentation>
|
6
|
+
</xs:annotation>
|
7
|
+
|
8
|
+
<xs:complexType name="Source">
|
9
|
+
<xs:sequence>
|
10
|
+
<xs:element name="title" minOccurs="1" maxOccurs="1" type="xs:string"/>
|
11
|
+
<xs:element name="author" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
12
|
+
<xs:element name="citation-part" minOccurs="1" maxOccurs="1" type="xs:string"/>
|
13
|
+
<xs:element name="principal" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
14
|
+
<xs:element name="funder" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
15
|
+
<xs:element name="distributor" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
16
|
+
<xs:element name="distributor-address" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
17
|
+
<xs:element name="address" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
18
|
+
<xs:element name="date" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
19
|
+
<xs:element name="license" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
20
|
+
<xs:element name="license-url" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
21
|
+
<xs:element name="reference-system" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
22
|
+
<xs:element name="editor" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
23
|
+
<xs:element name="editorial-note" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
24
|
+
<xs:element name="annotator" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
25
|
+
<xs:element name="reviewer" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
26
|
+
<xs:element name="electronic-text-editor" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
27
|
+
<xs:element name="electronic-text-title" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
28
|
+
<xs:element name="electronic-text-version" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
29
|
+
<xs:element name="electronic-text-publisher" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
30
|
+
<xs:element name="electronic-text-place" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
31
|
+
<xs:element name="electronic-text-date" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
32
|
+
<xs:element name="electronic-text-original-url" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
33
|
+
<xs:element name="electronic-text-license" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
34
|
+
<xs:element name="electronic-text-license-url" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
35
|
+
<xs:element name="printed-text-editor" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
36
|
+
<xs:element name="printed-text-title" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
37
|
+
<xs:element name="printed-text-edition" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
38
|
+
<xs:element name="printed-text-publisher" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
39
|
+
<xs:element name="printed-text-place" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
40
|
+
<xs:element name="printed-text-date" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
41
|
+
|
42
|
+
<xs:element name="div" minOccurs="1" maxOccurs="unbounded" type="Div"/>
|
43
|
+
</xs:sequence>
|
44
|
+
|
45
|
+
<xs:attribute name="id" type="xs:string" use="required"/>
|
46
|
+
<xs:attribute name="alignment-id" type="xs:string" use="optional"/>
|
47
|
+
<xs:attribute name="language" type="xs:string" use="required"/>
|
48
|
+
</xs:complexType>
|
49
|
+
|
50
|
+
<xs:complexType name="Div">
|
51
|
+
<xs:sequence>
|
52
|
+
<xs:element name="title" minOccurs="1" maxOccurs="1" type="xs:string"/>
|
53
|
+
<xs:element name="sentence" minOccurs="1" maxOccurs="unbounded" type="Sentence"/>
|
54
|
+
</xs:sequence>
|
55
|
+
|
56
|
+
<xs:attribute name="id" type="xs:nonNegativeInteger" use="optional"/>
|
57
|
+
<xs:attribute name="alignment-id" type="xs:nonNegativeInteger" use="optional"/>
|
58
|
+
<xs:attribute name="presentation-before" type="xs:string" use="optional"/>
|
59
|
+
<xs:attribute name="presentation-after" type="xs:string" use="optional"/>
|
60
|
+
</xs:complexType>
|
61
|
+
|
62
|
+
<xs:complexType name="Sentence">
|
63
|
+
<xs:sequence>
|
64
|
+
<xs:element name="token" minOccurs="1" maxOccurs="unbounded" type="Token"/>
|
65
|
+
</xs:sequence>
|
66
|
+
|
67
|
+
<xs:attribute name="id" type="xs:nonNegativeInteger" use="optional"/>
|
68
|
+
<xs:attribute name="alignment-id" type="xs:nonNegativeInteger" use="optional"/>
|
69
|
+
<xs:attribute name="presentation-before" type="xs:string" use="optional"/>
|
70
|
+
<xs:attribute name="presentation-after" type="xs:string" use="optional"/>
|
71
|
+
<xs:attribute name="status" type="SentenceStatus" use="optional"/>
|
72
|
+
<xs:attribute name="annotated-by" type="xs:string" use="optional"/>
|
73
|
+
<xs:attribute name="annotated-at" type="xs:dateTime" use="optional"/>
|
74
|
+
<xs:attribute name="reviewed-by" type="xs:string" use="optional"/>
|
75
|
+
<xs:attribute name="reviewed-at" type="xs:dateTime" use="optional"/>
|
76
|
+
</xs:complexType>
|
77
|
+
|
78
|
+
<xs:simpleType name="SentenceStatus">
|
79
|
+
<xs:restriction base="xs:string">
|
80
|
+
<xs:enumeration value="annotated"/>
|
81
|
+
<xs:enumeration value="reviewed"/>
|
82
|
+
<xs:enumeration value="unannotated"/>
|
83
|
+
</xs:restriction>
|
84
|
+
</xs:simpleType>
|
85
|
+
|
86
|
+
<xs:complexType name="Token">
|
87
|
+
<xs:sequence>
|
88
|
+
<xs:element name="slash" minOccurs="0" maxOccurs="unbounded" type="Slash"/>
|
89
|
+
</xs:sequence>
|
90
|
+
|
91
|
+
<xs:attribute name="id" type="xs:nonNegativeInteger" use="optional"/>
|
92
|
+
<xs:attribute name="alignment-id" type="xs:nonNegativeInteger" use="optional"/>
|
93
|
+
<xs:attribute name="lemma" type="xs:string" use="optional"/>
|
94
|
+
<xs:attribute name="part-of-speech" type="xs:string" use="optional"/>
|
95
|
+
<xs:attribute name="morphology" type="xs:string" use="optional"/>
|
96
|
+
<xs:attribute name="citation-part" type="xs:string" use="optional"/>
|
97
|
+
<xs:attribute name="relation" type="xs:string" use="optional"/>
|
98
|
+
<xs:attribute name="head-id" type="xs:nonNegativeInteger" use="optional"/>
|
99
|
+
<xs:attribute name="information-status" type="xs:string" use="optional"/>
|
100
|
+
<xs:attribute name="antecedent-id" type="xs:nonNegativeInteger" use="optional"/>
|
101
|
+
<xs:attribute name="contrast-group" type="xs:string" use="optional"/>
|
102
|
+
<xs:attribute name="foreign-ids" type="xs:string" use="optional"/>
|
103
|
+
|
104
|
+
<!-- XSD does not allow us to constrain the use of these attributes properly so they are marked optional even though their occurrence depends on the value of empty-token-sort and form. -->
|
105
|
+
<xs:attribute name="empty-token-sort" type="EmptyTokenSort" use="optional"/>
|
106
|
+
<xs:attribute name="form" type="xs:string" use="optional"/>
|
107
|
+
<xs:attribute name="presentation-before" type="xs:string" use="optional"/>
|
108
|
+
<xs:attribute name="presentation-after" type="xs:string" use="optional"/>
|
109
|
+
</xs:complexType>
|
110
|
+
|
111
|
+
<xs:simpleType name="EmptyTokenSort">
|
112
|
+
<xs:restriction base="xs:string">
|
113
|
+
<xs:enumeration value="P"/>
|
114
|
+
<xs:enumeration value="C"/>
|
115
|
+
<xs:enumeration value="V"/>
|
116
|
+
</xs:restriction>
|
117
|
+
</xs:simpleType>
|
118
|
+
|
119
|
+
<xs:complexType name="Slash">
|
120
|
+
<xs:attribute name="target-id" type="xs:nonNegativeInteger" use="required"/>
|
121
|
+
<xs:attribute name="relation" type="xs:string" use="required"/>
|
122
|
+
</xs:complexType>
|
123
|
+
|
124
|
+
<xs:complexType name="PartOfSpeechValue">
|
125
|
+
<xs:attribute name="tag" type="xs:string" use="required"/>
|
126
|
+
<xs:attribute name="summary" type="xs:string" use="required"/>
|
127
|
+
</xs:complexType>
|
128
|
+
|
129
|
+
<xs:complexType name="PartsOfSpeech">
|
130
|
+
<xs:sequence>
|
131
|
+
<xs:element name='value' minOccurs="1" maxOccurs="unbounded" type='PartOfSpeechValue'/>
|
132
|
+
</xs:sequence>
|
133
|
+
</xs:complexType>
|
134
|
+
|
135
|
+
<xs:complexType name="InformationStatusValue">
|
136
|
+
<xs:attribute name="tag" type="xs:string" use="required"/>
|
137
|
+
<xs:attribute name="summary" type="xs:string" use="required"/>
|
138
|
+
</xs:complexType>
|
139
|
+
|
140
|
+
<xs:complexType name="InformationStatuses">
|
141
|
+
<xs:sequence>
|
142
|
+
<xs:element name='value' minOccurs="1" maxOccurs="unbounded" type='InformationStatusValue'/>
|
143
|
+
</xs:sequence>
|
144
|
+
</xs:complexType>
|
145
|
+
|
146
|
+
<xs:complexType name="RelationValue">
|
147
|
+
<xs:attribute name="tag" type="xs:string" use="required"/>
|
148
|
+
<xs:attribute name="summary" type="xs:string" use="required"/>
|
149
|
+
<xs:attribute name="primary" type="xs:boolean" use="required"/>
|
150
|
+
<xs:attribute name="secondary" type="xs:boolean" use="required"/>
|
151
|
+
</xs:complexType>
|
152
|
+
|
153
|
+
<xs:complexType name="Relations">
|
154
|
+
<xs:sequence>
|
155
|
+
<xs:element name='value' minOccurs="1" maxOccurs="unbounded" type='RelationValue'/>
|
156
|
+
</xs:sequence>
|
157
|
+
</xs:complexType>
|
158
|
+
|
159
|
+
<xs:complexType name="MorphologyValue">
|
160
|
+
<xs:attribute name="tag" type="xs:string" use="required"/>
|
161
|
+
<xs:attribute name="summary" type="xs:string" use="required"/>
|
162
|
+
</xs:complexType>
|
163
|
+
|
164
|
+
<xs:complexType name="MorphologyField">
|
165
|
+
<xs:sequence>
|
166
|
+
<xs:element name='value' minOccurs="1" maxOccurs="unbounded" type='MorphologyValue'/>
|
167
|
+
</xs:sequence>
|
168
|
+
|
169
|
+
<xs:attribute name="tag" type="xs:string" use="required"/>
|
170
|
+
</xs:complexType>
|
171
|
+
|
172
|
+
<xs:complexType name="Morphology">
|
173
|
+
<xs:sequence>
|
174
|
+
<xs:element name='field' minOccurs="1" maxOccurs="unbounded" type='MorphologyField'/>
|
175
|
+
</xs:sequence>
|
176
|
+
</xs:complexType>
|
177
|
+
|
178
|
+
<xs:complexType name="Annotation">
|
179
|
+
<xs:sequence>
|
180
|
+
<xs:element name='relations' minOccurs="1" maxOccurs="1" type='Relations'/>
|
181
|
+
<xs:element name='parts-of-speech' minOccurs="1" maxOccurs="1" type='PartsOfSpeech'/>
|
182
|
+
<xs:element name='morphology' minOccurs="1" maxOccurs="1" type='Morphology'/>
|
183
|
+
<xs:element name='information-statuses' minOccurs="1" maxOccurs="1" type='InformationStatuses'/>
|
184
|
+
</xs:sequence>
|
185
|
+
</xs:complexType>
|
186
|
+
|
187
|
+
<xs:complexType name="Proiel">
|
188
|
+
<xs:sequence>
|
189
|
+
<xs:element name='annotation' minOccurs="0" maxOccurs="1" type='Annotation'/>
|
190
|
+
<xs:element name='source' minOccurs="1" maxOccurs="unbounded" type='Source'/>
|
191
|
+
</xs:sequence>
|
192
|
+
|
193
|
+
<xs:attribute name='export-time' type="xs:dateTime" use="optional"/>
|
194
|
+
<xs:attribute name="schema-version" type="xs:decimal" use="required" fixed="2.1"/>
|
195
|
+
</xs:complexType>
|
196
|
+
|
197
|
+
<xs:element name='proiel' type='Proiel'/>
|
198
|
+
</xs:schema>
|
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2015 Marius L. Jøhndal
|
2
|
+
# Copyright (c) 2015-2016 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
@@ -20,6 +20,7 @@ module PROIEL
|
|
20
20
|
include SAXMachine
|
21
21
|
|
22
22
|
attribute :id, class: Integer, required: true
|
23
|
+
attribute :'alignment-id', as: :alignment_id, class: Integer, required: false
|
23
24
|
attribute :'head-id', as: :head_id, class: Integer
|
24
25
|
attribute :form
|
25
26
|
attribute :lemma
|
@@ -43,7 +44,12 @@ module PROIEL
|
|
43
44
|
include SAXMachine
|
44
45
|
|
45
46
|
attribute :id, class: Integer, required: true
|
47
|
+
attribute :'alignment-id', as: :alignment_id, class: Integer, required: false
|
46
48
|
attribute :status, class: Symbol, default: :unannotated
|
49
|
+
attribute :'annotated-by', as: :annotated_by, required: false
|
50
|
+
attribute :'reviewed-by', as: :reviewed_by, required: false
|
51
|
+
attribute :'annotated-at', as: :annotated_at, required: false
|
52
|
+
attribute :'reviewed-at', as: :reviewed_at, required: false
|
47
53
|
attribute :'presentation-before', as: :presentation_before
|
48
54
|
attribute :'presentation-after', as: :presentation_after
|
49
55
|
|
@@ -54,7 +60,8 @@ module PROIEL
|
|
54
60
|
class Div
|
55
61
|
include SAXMachine
|
56
62
|
|
57
|
-
attribute :id
|
63
|
+
attribute :id, class: Integer, required: false
|
64
|
+
attribute :'alignment-id', as: :alignment_id, class: Integer, required: false
|
58
65
|
attribute :'presentation-before', as: :presentation_before
|
59
66
|
attribute :'presentation-after', as: :presentation_after
|
60
67
|
|
@@ -67,6 +74,7 @@ module PROIEL
|
|
67
74
|
include SAXMachine
|
68
75
|
|
69
76
|
attribute :id, required: true
|
77
|
+
attribute :'alignment-id', as: :alignment_id, required: false
|
70
78
|
attribute :language, required: true
|
71
79
|
|
72
80
|
element :title
|
@@ -16,7 +16,7 @@ module PROIEL
|
|
16
16
|
# @return [String] schema version number
|
17
17
|
#
|
18
18
|
def self.current_proiel_xml_schema_version
|
19
|
-
'2.0'
|
19
|
+
'2.1'
|
20
20
|
end
|
21
21
|
|
22
22
|
# Invalid PROIEL XML schema version error.
|
@@ -39,6 +39,8 @@ module PROIEL
|
|
39
39
|
case doc.root.attr('schema-version')
|
40
40
|
when '2.0'
|
41
41
|
'2.0'
|
42
|
+
when '2.1'
|
43
|
+
'2.1'
|
42
44
|
when NilClass
|
43
45
|
'1.0'
|
44
46
|
else
|
@@ -68,7 +70,7 @@ module PROIEL
|
|
68
70
|
# @raise ArgumentError
|
69
71
|
#
|
70
72
|
def self.proiel_xml_schema_filename(schema_version)
|
71
|
-
if schema_version == '1.0' or schema_version == '2.0'
|
73
|
+
if schema_version == '1.0' or schema_version == '2.0' or schema_version == '2.1'
|
72
74
|
File.join(File.dirname(__FILE__),
|
73
75
|
"proiel-#{schema_version}",
|
74
76
|
"proiel-#{schema_version}.xsd")
|
@@ -145,6 +145,16 @@ module PROIEL
|
|
145
145
|
# Pass 3: verify that all features are defined
|
146
146
|
# TBD
|
147
147
|
|
148
|
+
# Pass 4: alignment_id on div, sentence or token requires an alignment_id on source
|
149
|
+
tb.sources.each do |source|
|
150
|
+
if source.alignment_id.nil?
|
151
|
+
if source.divs.any?(&:alignment_id) or source.sentences.any?(&:alignment_id) or source.tokens.any?(&:alignment_id)
|
152
|
+
errors << "Alignment ID(s) on divs, sentences or tokens without alignment ID on source"
|
153
|
+
end
|
154
|
+
end
|
155
|
+
end
|
156
|
+
|
157
|
+
# Decide if there were any errors
|
148
158
|
if errors.empty?
|
149
159
|
true
|
150
160
|
else
|
data/lib/proiel/sentence.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2015 Marius L. Jøhndal
|
2
|
+
# Copyright (c) 2015-2016 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
@@ -23,8 +23,23 @@ module PROIEL
|
|
23
23
|
# @return [nil, String] presentation material after sentence
|
24
24
|
attr_reader :presentation_after
|
25
25
|
|
26
|
+
# @return [nil, Integer] ID of the sentence that this sentence is aligned to
|
27
|
+
attr_reader :alignment_id
|
28
|
+
|
29
|
+
# @return [nil, String] annotator of sentence
|
30
|
+
attr_reader :annotated_by
|
31
|
+
|
32
|
+
# @return [nil, String] reviewer of sentence
|
33
|
+
attr_reader :reviewed_by
|
34
|
+
|
35
|
+
# @return [nil, DateTime] time of annotation
|
36
|
+
attr_reader :annotated_at
|
37
|
+
|
38
|
+
# @return [nil, DateTime] time of review
|
39
|
+
attr_reader :reviewed_at
|
40
|
+
|
26
41
|
# Creates a new sentence object.
|
27
|
-
def initialize(parent, id, status, presentation_before, presentation_after, &block)
|
42
|
+
def initialize(parent, id, status, presentation_before, presentation_after, alignment_id, annotated_by, reviewed_by, annotated_at, reviewed_at, &block)
|
28
43
|
@div = parent
|
29
44
|
|
30
45
|
raise ArgumentError, 'integer expected' unless id.is_a?(Integer)
|
@@ -39,6 +54,21 @@ module PROIEL
|
|
39
54
|
raise ArgumentError, 'string or nil expected' unless presentation_after.nil? or presentation_after.is_a?(String)
|
40
55
|
@presentation_after = presentation_after.freeze
|
41
56
|
|
57
|
+
raise ArgumentError, 'integer or nil expected' unless alignment_id.nil? or alignment_id.is_a?(Integer)
|
58
|
+
@alignment_id = alignment_id
|
59
|
+
|
60
|
+
raise ArgumentError, 'XML schema date time or nil expected' unless annotated_at.nil? or PROIEL::Utilities.xmlschema_datetime?(annotated_at)
|
61
|
+
@annotated_at = annotated_at ? DateTime.xmlschema(annotated_at).freeze : nil
|
62
|
+
|
63
|
+
raise ArgumentError, 'XML schema date time or nil expected' unless reviewed_at.nil? or PROIEL::Utilities.xmlschema_datetime?(reviewed_at)
|
64
|
+
@reviewed_at = reviewed_at ? DateTime.xmlschema(reviewed_at).freeze : nil
|
65
|
+
|
66
|
+
raise ArgumentError, 'string or nil expected' unless annotated_by.nil? or annotated_by.is_a?(String)
|
67
|
+
@annotated_by = annotated_by.freeze
|
68
|
+
|
69
|
+
raise ArgumentError, 'string or nil expected' unless reviewed_by.nil? or reviewed_by.is_a?(String)
|
70
|
+
@reviewed_by = reviewed_by.freeze
|
71
|
+
|
42
72
|
@children = block.call(self) if block_given?
|
43
73
|
end
|
44
74
|
|
data/lib/proiel/source.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2015 Marius L. Jøhndal
|
2
|
+
# Copyright (c) 2015-2016 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
@@ -9,7 +9,7 @@ module PROIEL
|
|
9
9
|
# @return [String] ID of the source
|
10
10
|
attr_reader :id
|
11
11
|
|
12
|
-
# @return [Treebank] treebank that
|
12
|
+
# @return [Treebank] treebank that this source belongs to
|
13
13
|
attr_reader :treebank
|
14
14
|
|
15
15
|
# @return [String] language of the source as an ISO 639-3 language tag
|
@@ -22,13 +22,20 @@ module PROIEL
|
|
22
22
|
# @see PROIEL::Treebank::METADATA_ELEMENTS
|
23
23
|
attr_reader :metadata
|
24
24
|
|
25
|
+
# @return [nil, String] ID of the source that this source is aligned to
|
26
|
+
attr_reader :alignment_id
|
27
|
+
|
25
28
|
# Creates a new source object.
|
26
|
-
def initialize(parent, id, export_time, language, metadata, &block)
|
29
|
+
def initialize(parent, id, export_time, language, metadata, alignment_id, &block)
|
27
30
|
@treebank = parent
|
28
31
|
@id = id.freeze
|
29
32
|
@export_time = DateTime.parse(export_time).freeze
|
30
33
|
@language = language.freeze
|
31
34
|
@metadata = metadata.freeze
|
35
|
+
|
36
|
+
raise ArgumentError, 'string or nil expected' unless alignment_id.nil? or alignment_id.is_a?(String)
|
37
|
+
@alignment_id = alignment_id.freeze
|
38
|
+
|
32
39
|
@children = block.call(self) if block_given?
|
33
40
|
end
|
34
41
|
|
data/lib/proiel/token.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2015 Marius L. Jøhndal
|
2
|
+
# Copyright (c) 2015-2016 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
@@ -63,11 +63,15 @@ module PROIEL
|
|
63
63
|
# @return [Array<Array<String,Fixnum>>] secondary edges as an array of pairs of relation tag and target token ID
|
64
64
|
attr_reader :slashes
|
65
65
|
|
66
|
+
# @return [nil, Integer] ID of the token that this token is aligned to
|
67
|
+
attr_reader :alignment_id
|
68
|
+
|
66
69
|
# Creates a new token object.
|
67
70
|
def initialize(parent, id, head_id, form, lemma, part_of_speech,
|
68
71
|
morphology, relation, empty_token_sort, citation_part,
|
69
72
|
presentation_before, presentation_after, antecedent_id,
|
70
|
-
information_status, contrast_group, foreign_ids, slashes)
|
73
|
+
information_status, contrast_group, foreign_ids, slashes,
|
74
|
+
alignment_id)
|
71
75
|
@sentence = parent
|
72
76
|
|
73
77
|
raise ArgumentError, 'integer expected' unless id.is_a?(Integer)
|
@@ -117,6 +121,9 @@ module PROIEL
|
|
117
121
|
|
118
122
|
raise ArgumentError, 'array expected' unless slashes.is_a?(Array)
|
119
123
|
@slashes = slashes.map { |s| [s.relation.freeze, s.target_id] }
|
124
|
+
|
125
|
+
raise ArgumentError, 'integer or nil expected' unless alignment_id.nil? or alignment_id.is_a?(Integer)
|
126
|
+
@alignment_id = alignment_id
|
120
127
|
end
|
121
128
|
|
122
129
|
# @return [Div] parent div object
|
data/lib/proiel/treebank.rb
CHANGED
@@ -86,7 +86,7 @@ module PROIEL
|
|
86
86
|
|
87
87
|
tf.proiel.sources.each do |s|
|
88
88
|
@sources << Source.new(self, s.id, tf.proiel.export_time, s.language,
|
89
|
-
bundle_metadata(s)) do |source|
|
89
|
+
bundle_metadata(s), s.alignment_id) do |source|
|
90
90
|
build_divs(s, source)
|
91
91
|
end
|
92
92
|
|
@@ -165,10 +165,11 @@ module PROIEL
|
|
165
165
|
end
|
166
166
|
|
167
167
|
def build_divs(s, source)
|
168
|
-
#
|
168
|
+
# For PROIEL XML 2.0 we generate an ID, for PROIEL XML >= 2.1 we respect the ID
|
169
|
+
# from the XML file.
|
169
170
|
s.divs.each_with_index.map do |d, i|
|
170
|
-
Div.new(source, i + 1, d.title, d.presentation_before,
|
171
|
-
d.presentation_after) do |div|
|
171
|
+
Div.new(source, d.id || i + 1, d.title, d.presentation_before,
|
172
|
+
d.presentation_after, d.alignment_id) do |div|
|
172
173
|
build_sentences(d, div)
|
173
174
|
end
|
174
175
|
end
|
@@ -177,7 +178,9 @@ module PROIEL
|
|
177
178
|
def build_sentences(d, div)
|
178
179
|
d.sentences.map do |e|
|
179
180
|
Sentence.new(div, e.id, e.status, e.presentation_before,
|
180
|
-
e.presentation_after) do |sentence|
|
181
|
+
e.presentation_after, e.alignment_id,
|
182
|
+
e.annotated_by, e.reviewed_by, e.annotated_at,
|
183
|
+
e.reviewed_at) do |sentence|
|
181
184
|
build_tokens(e, sentence)
|
182
185
|
end
|
183
186
|
end
|
@@ -191,7 +194,7 @@ module PROIEL
|
|
191
194
|
t.presentation_before, t.presentation_after,
|
192
195
|
t.antecedent_id, t.information_status,
|
193
196
|
t.contrast_group, t.foreign_ids,
|
194
|
-
t.slashes)
|
197
|
+
t.slashes, t.alignment_id)
|
195
198
|
end
|
196
199
|
end
|
197
200
|
|
data/lib/proiel/utils.rb
ADDED
data/lib/proiel/version.rb
CHANGED
data/lib/proiel.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2015 Marius L. Jøhndal
|
2
|
+
# Copyright (c) 2015-2016 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
@@ -12,6 +12,7 @@ require 'memoist'
|
|
12
12
|
require 'nokogiri'
|
13
13
|
|
14
14
|
require 'proiel/version'
|
15
|
+
require 'proiel/utils'
|
15
16
|
require 'proiel/citations'
|
16
17
|
require 'proiel/statistics'
|
17
18
|
require 'proiel/tokenization'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: proiel
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Marius L. Jøhndal
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-06-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: json
|
@@ -167,15 +167,11 @@ files:
|
|
167
167
|
- lib/proiel/citations.rb
|
168
168
|
- lib/proiel/div.rb
|
169
169
|
- lib/proiel/positional_tag.rb
|
170
|
-
- lib/proiel/proiel_xml/proiel-1.0/proiel-1.0.md
|
171
170
|
- lib/proiel/proiel_xml/proiel-1.0/proiel-1.0.xsd
|
172
171
|
- lib/proiel/proiel_xml/proiel-1.0/teilite.xsd
|
173
172
|
- lib/proiel/proiel_xml/proiel-1.0/xml.xsd
|
174
|
-
- lib/proiel/proiel_xml/proiel-2.0/XMLSchema.xsd
|
175
|
-
- lib/proiel/proiel_xml/proiel-2.0/make
|
176
|
-
- lib/proiel/proiel_xml/proiel-2.0/proiel-2.0.html
|
177
173
|
- lib/proiel/proiel_xml/proiel-2.0/proiel-2.0.xsd
|
178
|
-
- lib/proiel/proiel_xml/proiel-2.0/xs3p.xsl
|
174
|
+
- lib/proiel/proiel_xml/proiel-2.1/proiel-2.1.xsd
|
179
175
|
- lib/proiel/proiel_xml/reader.rb
|
180
176
|
- lib/proiel/proiel_xml/schema.rb
|
181
177
|
- lib/proiel/proiel_xml/validator.rb
|
@@ -186,6 +182,7 @@ files:
|
|
186
182
|
- lib/proiel/tokenization.rb
|
187
183
|
- lib/proiel/treebank.rb
|
188
184
|
- lib/proiel/treebank_object.rb
|
185
|
+
- lib/proiel/utils.rb
|
189
186
|
- lib/proiel/version.rb
|
190
187
|
homepage: http://proiel.github.com
|
191
188
|
licenses:
|
@@ -1,16 +0,0 @@
|
|
1
|
-
# The PROIEL XML format
|
2
|
-
|
3
|
-
## `token`
|
4
|
-
|
5
|
-
### `lemma` attribute (string, optional)
|
6
|
-
|
7
|
-
When it is necessary to distinguish lemmas with the same textual form, the
|
8
|
-
PROIEL XML convention is to use the associated part of speech to distinguish them.
|
9
|
-
|
10
|
-
If there are multiple lemmas with the same textual form and the same part of
|
11
|
-
speech, the convention is to append `#` and a positive, non-zero integer:
|
12
|
-
|
13
|
-
```
|
14
|
-
quod#1
|
15
|
-
quod#2
|
16
|
-
```
|