proiel 1.0.1 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/proiel/div.rb +8 -2
- data/lib/proiel/proiel_xml/proiel-2.1/proiel-2.1.xsd +198 -0
- data/lib/proiel/proiel_xml/reader.rb +10 -2
- data/lib/proiel/proiel_xml/schema.rb +4 -2
- data/lib/proiel/proiel_xml/validator.rb +10 -0
- data/lib/proiel/sentence.rb +32 -2
- data/lib/proiel/source.rb +10 -3
- data/lib/proiel/token.rb +9 -2
- data/lib/proiel/treebank.rb +9 -6
- data/lib/proiel/utils.rb +16 -0
- data/lib/proiel/version.rb +1 -1
- data/lib/proiel.rb +2 -1
- metadata +4 -7
- data/lib/proiel/proiel_xml/proiel-1.0/proiel-1.0.md +0 -16
- data/lib/proiel/proiel_xml/proiel-2.0/XMLSchema.xsd +0 -2492
- data/lib/proiel/proiel_xml/proiel-2.0/make +0 -1
- data/lib/proiel/proiel_xml/proiel-2.0/proiel-2.0.html +0 -1928
- data/lib/proiel/proiel_xml/proiel-2.0/xs3p.xsl +0 -8520
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 39313c422eb3b2d2f3ad565c0cde3cbd3ddb5271
|
4
|
+
data.tar.gz: 1edadad95bbaad82d4d7ab1a9cc409f8e80d3a74
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 41bf1b5bcb3c8d8318128ea146b2609d02942d711553876d71c29cafc948312e79e8cd2e448fef751ca25c685c3f0d57a924004a46bdb3496a8f9913772e3e48
|
7
|
+
data.tar.gz: add1511098c62bdd4ee59fdd53e55b4b331595a5a5e02320e97dadff194e8b0b96fb24bd48511a02933d70a83208e0f5d40093a49eec959917177bf59589cbb7
|
data/lib/proiel/div.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2015 Marius L. Jøhndal
|
2
|
+
# Copyright (c) 2015-2016 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
@@ -28,8 +28,11 @@ module PROIEL
|
|
28
28
|
# @return [nil, String] presentation material after div
|
29
29
|
attr_reader :presentation_after
|
30
30
|
|
31
|
+
# @return [nil, Integer] ID of the div that this div is aligned to
|
32
|
+
attr_reader :alignment_id
|
33
|
+
|
31
34
|
# Creates a new div object.
|
32
|
-
def initialize(parent, id, title, presentation_before, presentation_after, &block)
|
35
|
+
def initialize(parent, id, title, presentation_before, presentation_after, alignment_id, &block)
|
33
36
|
@source = parent
|
34
37
|
|
35
38
|
raise ArgumentError, 'integer expected' unless id.is_a?(Integer)
|
@@ -44,6 +47,9 @@ module PROIEL
|
|
44
47
|
raise ArgumentError, 'string or nil expected' unless presentation_after.nil? or presentation_after.is_a?(String)
|
45
48
|
@presentation_after = presentation_after.freeze
|
46
49
|
|
50
|
+
raise ArgumentError, 'integer or nil expected' unless alignment_id.nil? or alignment_id.is_a?(Integer)
|
51
|
+
@alignment_id = alignment_id
|
52
|
+
|
47
53
|
@children = block.call(self) if block_given?
|
48
54
|
end
|
49
55
|
|
@@ -0,0 +1,198 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
|
3
|
+
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
|
4
|
+
<xs:annotation>
|
5
|
+
<xs:documentation>PROIEL XML format version 2.1</xs:documentation>
|
6
|
+
</xs:annotation>
|
7
|
+
|
8
|
+
<xs:complexType name="Source">
|
9
|
+
<xs:sequence>
|
10
|
+
<xs:element name="title" minOccurs="1" maxOccurs="1" type="xs:string"/>
|
11
|
+
<xs:element name="author" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
12
|
+
<xs:element name="citation-part" minOccurs="1" maxOccurs="1" type="xs:string"/>
|
13
|
+
<xs:element name="principal" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
14
|
+
<xs:element name="funder" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
15
|
+
<xs:element name="distributor" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
16
|
+
<xs:element name="distributor-address" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
17
|
+
<xs:element name="address" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
18
|
+
<xs:element name="date" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
19
|
+
<xs:element name="license" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
20
|
+
<xs:element name="license-url" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
21
|
+
<xs:element name="reference-system" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
22
|
+
<xs:element name="editor" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
23
|
+
<xs:element name="editorial-note" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
24
|
+
<xs:element name="annotator" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
25
|
+
<xs:element name="reviewer" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
26
|
+
<xs:element name="electronic-text-editor" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
27
|
+
<xs:element name="electronic-text-title" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
28
|
+
<xs:element name="electronic-text-version" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
29
|
+
<xs:element name="electronic-text-publisher" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
30
|
+
<xs:element name="electronic-text-place" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
31
|
+
<xs:element name="electronic-text-date" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
32
|
+
<xs:element name="electronic-text-original-url" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
33
|
+
<xs:element name="electronic-text-license" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
34
|
+
<xs:element name="electronic-text-license-url" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
35
|
+
<xs:element name="printed-text-editor" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
36
|
+
<xs:element name="printed-text-title" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
37
|
+
<xs:element name="printed-text-edition" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
38
|
+
<xs:element name="printed-text-publisher" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
39
|
+
<xs:element name="printed-text-place" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
40
|
+
<xs:element name="printed-text-date" minOccurs="0" maxOccurs="1" type="xs:string"/>
|
41
|
+
|
42
|
+
<xs:element name="div" minOccurs="1" maxOccurs="unbounded" type="Div"/>
|
43
|
+
</xs:sequence>
|
44
|
+
|
45
|
+
<xs:attribute name="id" type="xs:string" use="required"/>
|
46
|
+
<xs:attribute name="alignment-id" type="xs:string" use="optional"/>
|
47
|
+
<xs:attribute name="language" type="xs:string" use="required"/>
|
48
|
+
</xs:complexType>
|
49
|
+
|
50
|
+
<xs:complexType name="Div">
|
51
|
+
<xs:sequence>
|
52
|
+
<xs:element name="title" minOccurs="1" maxOccurs="1" type="xs:string"/>
|
53
|
+
<xs:element name="sentence" minOccurs="1" maxOccurs="unbounded" type="Sentence"/>
|
54
|
+
</xs:sequence>
|
55
|
+
|
56
|
+
<xs:attribute name="id" type="xs:nonNegativeInteger" use="optional"/>
|
57
|
+
<xs:attribute name="alignment-id" type="xs:nonNegativeInteger" use="optional"/>
|
58
|
+
<xs:attribute name="presentation-before" type="xs:string" use="optional"/>
|
59
|
+
<xs:attribute name="presentation-after" type="xs:string" use="optional"/>
|
60
|
+
</xs:complexType>
|
61
|
+
|
62
|
+
<xs:complexType name="Sentence">
|
63
|
+
<xs:sequence>
|
64
|
+
<xs:element name="token" minOccurs="1" maxOccurs="unbounded" type="Token"/>
|
65
|
+
</xs:sequence>
|
66
|
+
|
67
|
+
<xs:attribute name="id" type="xs:nonNegativeInteger" use="optional"/>
|
68
|
+
<xs:attribute name="alignment-id" type="xs:nonNegativeInteger" use="optional"/>
|
69
|
+
<xs:attribute name="presentation-before" type="xs:string" use="optional"/>
|
70
|
+
<xs:attribute name="presentation-after" type="xs:string" use="optional"/>
|
71
|
+
<xs:attribute name="status" type="SentenceStatus" use="optional"/>
|
72
|
+
<xs:attribute name="annotated-by" type="xs:string" use="optional"/>
|
73
|
+
<xs:attribute name="annotated-at" type="xs:dateTime" use="optional"/>
|
74
|
+
<xs:attribute name="reviewed-by" type="xs:string" use="optional"/>
|
75
|
+
<xs:attribute name="reviewed-at" type="xs:dateTime" use="optional"/>
|
76
|
+
</xs:complexType>
|
77
|
+
|
78
|
+
<xs:simpleType name="SentenceStatus">
|
79
|
+
<xs:restriction base="xs:string">
|
80
|
+
<xs:enumeration value="annotated"/>
|
81
|
+
<xs:enumeration value="reviewed"/>
|
82
|
+
<xs:enumeration value="unannotated"/>
|
83
|
+
</xs:restriction>
|
84
|
+
</xs:simpleType>
|
85
|
+
|
86
|
+
<xs:complexType name="Token">
|
87
|
+
<xs:sequence>
|
88
|
+
<xs:element name="slash" minOccurs="0" maxOccurs="unbounded" type="Slash"/>
|
89
|
+
</xs:sequence>
|
90
|
+
|
91
|
+
<xs:attribute name="id" type="xs:nonNegativeInteger" use="optional"/>
|
92
|
+
<xs:attribute name="alignment-id" type="xs:nonNegativeInteger" use="optional"/>
|
93
|
+
<xs:attribute name="lemma" type="xs:string" use="optional"/>
|
94
|
+
<xs:attribute name="part-of-speech" type="xs:string" use="optional"/>
|
95
|
+
<xs:attribute name="morphology" type="xs:string" use="optional"/>
|
96
|
+
<xs:attribute name="citation-part" type="xs:string" use="optional"/>
|
97
|
+
<xs:attribute name="relation" type="xs:string" use="optional"/>
|
98
|
+
<xs:attribute name="head-id" type="xs:nonNegativeInteger" use="optional"/>
|
99
|
+
<xs:attribute name="information-status" type="xs:string" use="optional"/>
|
100
|
+
<xs:attribute name="antecedent-id" type="xs:nonNegativeInteger" use="optional"/>
|
101
|
+
<xs:attribute name="contrast-group" type="xs:string" use="optional"/>
|
102
|
+
<xs:attribute name="foreign-ids" type="xs:string" use="optional"/>
|
103
|
+
|
104
|
+
<!-- XSD does not allow us to constrain the use of these attributes properly so they are marked optional even though their occurrence depends on the value of empty-token-sort and form. -->
|
105
|
+
<xs:attribute name="empty-token-sort" type="EmptyTokenSort" use="optional"/>
|
106
|
+
<xs:attribute name="form" type="xs:string" use="optional"/>
|
107
|
+
<xs:attribute name="presentation-before" type="xs:string" use="optional"/>
|
108
|
+
<xs:attribute name="presentation-after" type="xs:string" use="optional"/>
|
109
|
+
</xs:complexType>
|
110
|
+
|
111
|
+
<xs:simpleType name="EmptyTokenSort">
|
112
|
+
<xs:restriction base="xs:string">
|
113
|
+
<xs:enumeration value="P"/>
|
114
|
+
<xs:enumeration value="C"/>
|
115
|
+
<xs:enumeration value="V"/>
|
116
|
+
</xs:restriction>
|
117
|
+
</xs:simpleType>
|
118
|
+
|
119
|
+
<xs:complexType name="Slash">
|
120
|
+
<xs:attribute name="target-id" type="xs:nonNegativeInteger" use="required"/>
|
121
|
+
<xs:attribute name="relation" type="xs:string" use="required"/>
|
122
|
+
</xs:complexType>
|
123
|
+
|
124
|
+
<xs:complexType name="PartOfSpeechValue">
|
125
|
+
<xs:attribute name="tag" type="xs:string" use="required"/>
|
126
|
+
<xs:attribute name="summary" type="xs:string" use="required"/>
|
127
|
+
</xs:complexType>
|
128
|
+
|
129
|
+
<xs:complexType name="PartsOfSpeech">
|
130
|
+
<xs:sequence>
|
131
|
+
<xs:element name='value' minOccurs="1" maxOccurs="unbounded" type='PartOfSpeechValue'/>
|
132
|
+
</xs:sequence>
|
133
|
+
</xs:complexType>
|
134
|
+
|
135
|
+
<xs:complexType name="InformationStatusValue">
|
136
|
+
<xs:attribute name="tag" type="xs:string" use="required"/>
|
137
|
+
<xs:attribute name="summary" type="xs:string" use="required"/>
|
138
|
+
</xs:complexType>
|
139
|
+
|
140
|
+
<xs:complexType name="InformationStatuses">
|
141
|
+
<xs:sequence>
|
142
|
+
<xs:element name='value' minOccurs="1" maxOccurs="unbounded" type='InformationStatusValue'/>
|
143
|
+
</xs:sequence>
|
144
|
+
</xs:complexType>
|
145
|
+
|
146
|
+
<xs:complexType name="RelationValue">
|
147
|
+
<xs:attribute name="tag" type="xs:string" use="required"/>
|
148
|
+
<xs:attribute name="summary" type="xs:string" use="required"/>
|
149
|
+
<xs:attribute name="primary" type="xs:boolean" use="required"/>
|
150
|
+
<xs:attribute name="secondary" type="xs:boolean" use="required"/>
|
151
|
+
</xs:complexType>
|
152
|
+
|
153
|
+
<xs:complexType name="Relations">
|
154
|
+
<xs:sequence>
|
155
|
+
<xs:element name='value' minOccurs="1" maxOccurs="unbounded" type='RelationValue'/>
|
156
|
+
</xs:sequence>
|
157
|
+
</xs:complexType>
|
158
|
+
|
159
|
+
<xs:complexType name="MorphologyValue">
|
160
|
+
<xs:attribute name="tag" type="xs:string" use="required"/>
|
161
|
+
<xs:attribute name="summary" type="xs:string" use="required"/>
|
162
|
+
</xs:complexType>
|
163
|
+
|
164
|
+
<xs:complexType name="MorphologyField">
|
165
|
+
<xs:sequence>
|
166
|
+
<xs:element name='value' minOccurs="1" maxOccurs="unbounded" type='MorphologyValue'/>
|
167
|
+
</xs:sequence>
|
168
|
+
|
169
|
+
<xs:attribute name="tag" type="xs:string" use="required"/>
|
170
|
+
</xs:complexType>
|
171
|
+
|
172
|
+
<xs:complexType name="Morphology">
|
173
|
+
<xs:sequence>
|
174
|
+
<xs:element name='field' minOccurs="1" maxOccurs="unbounded" type='MorphologyField'/>
|
175
|
+
</xs:sequence>
|
176
|
+
</xs:complexType>
|
177
|
+
|
178
|
+
<xs:complexType name="Annotation">
|
179
|
+
<xs:sequence>
|
180
|
+
<xs:element name='relations' minOccurs="1" maxOccurs="1" type='Relations'/>
|
181
|
+
<xs:element name='parts-of-speech' minOccurs="1" maxOccurs="1" type='PartsOfSpeech'/>
|
182
|
+
<xs:element name='morphology' minOccurs="1" maxOccurs="1" type='Morphology'/>
|
183
|
+
<xs:element name='information-statuses' minOccurs="1" maxOccurs="1" type='InformationStatuses'/>
|
184
|
+
</xs:sequence>
|
185
|
+
</xs:complexType>
|
186
|
+
|
187
|
+
<xs:complexType name="Proiel">
|
188
|
+
<xs:sequence>
|
189
|
+
<xs:element name='annotation' minOccurs="0" maxOccurs="1" type='Annotation'/>
|
190
|
+
<xs:element name='source' minOccurs="1" maxOccurs="unbounded" type='Source'/>
|
191
|
+
</xs:sequence>
|
192
|
+
|
193
|
+
<xs:attribute name='export-time' type="xs:dateTime" use="optional"/>
|
194
|
+
<xs:attribute name="schema-version" type="xs:decimal" use="required" fixed="2.1"/>
|
195
|
+
</xs:complexType>
|
196
|
+
|
197
|
+
<xs:element name='proiel' type='Proiel'/>
|
198
|
+
</xs:schema>
|
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2015 Marius L. Jøhndal
|
2
|
+
# Copyright (c) 2015-2016 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
@@ -20,6 +20,7 @@ module PROIEL
|
|
20
20
|
include SAXMachine
|
21
21
|
|
22
22
|
attribute :id, class: Integer, required: true
|
23
|
+
attribute :'alignment-id', as: :alignment_id, class: Integer, required: false
|
23
24
|
attribute :'head-id', as: :head_id, class: Integer
|
24
25
|
attribute :form
|
25
26
|
attribute :lemma
|
@@ -43,7 +44,12 @@ module PROIEL
|
|
43
44
|
include SAXMachine
|
44
45
|
|
45
46
|
attribute :id, class: Integer, required: true
|
47
|
+
attribute :'alignment-id', as: :alignment_id, class: Integer, required: false
|
46
48
|
attribute :status, class: Symbol, default: :unannotated
|
49
|
+
attribute :'annotated-by', as: :annotated_by, required: false
|
50
|
+
attribute :'reviewed-by', as: :reviewed_by, required: false
|
51
|
+
attribute :'annotated-at', as: :annotated_at, required: false
|
52
|
+
attribute :'reviewed-at', as: :reviewed_at, required: false
|
47
53
|
attribute :'presentation-before', as: :presentation_before
|
48
54
|
attribute :'presentation-after', as: :presentation_after
|
49
55
|
|
@@ -54,7 +60,8 @@ module PROIEL
|
|
54
60
|
class Div
|
55
61
|
include SAXMachine
|
56
62
|
|
57
|
-
attribute :id
|
63
|
+
attribute :id, class: Integer, required: false
|
64
|
+
attribute :'alignment-id', as: :alignment_id, class: Integer, required: false
|
58
65
|
attribute :'presentation-before', as: :presentation_before
|
59
66
|
attribute :'presentation-after', as: :presentation_after
|
60
67
|
|
@@ -67,6 +74,7 @@ module PROIEL
|
|
67
74
|
include SAXMachine
|
68
75
|
|
69
76
|
attribute :id, required: true
|
77
|
+
attribute :'alignment-id', as: :alignment_id, required: false
|
70
78
|
attribute :language, required: true
|
71
79
|
|
72
80
|
element :title
|
@@ -16,7 +16,7 @@ module PROIEL
|
|
16
16
|
# @return [String] schema version number
|
17
17
|
#
|
18
18
|
def self.current_proiel_xml_schema_version
|
19
|
-
'2.0'
|
19
|
+
'2.1'
|
20
20
|
end
|
21
21
|
|
22
22
|
# Invalid PROIEL XML schema version error.
|
@@ -39,6 +39,8 @@ module PROIEL
|
|
39
39
|
case doc.root.attr('schema-version')
|
40
40
|
when '2.0'
|
41
41
|
'2.0'
|
42
|
+
when '2.1'
|
43
|
+
'2.1'
|
42
44
|
when NilClass
|
43
45
|
'1.0'
|
44
46
|
else
|
@@ -68,7 +70,7 @@ module PROIEL
|
|
68
70
|
# @raise ArgumentError
|
69
71
|
#
|
70
72
|
def self.proiel_xml_schema_filename(schema_version)
|
71
|
-
if schema_version == '1.0' or schema_version == '2.0'
|
73
|
+
if schema_version == '1.0' or schema_version == '2.0' or schema_version == '2.1'
|
72
74
|
File.join(File.dirname(__FILE__),
|
73
75
|
"proiel-#{schema_version}",
|
74
76
|
"proiel-#{schema_version}.xsd")
|
@@ -145,6 +145,16 @@ module PROIEL
|
|
145
145
|
# Pass 3: verify that all features are defined
|
146
146
|
# TBD
|
147
147
|
|
148
|
+
# Pass 4: alignment_id on div, sentence or token requires an alignment_id on source
|
149
|
+
tb.sources.each do |source|
|
150
|
+
if source.alignment_id.nil?
|
151
|
+
if source.divs.any?(&:alignment_id) or source.sentences.any?(&:alignment_id) or source.tokens.any?(&:alignment_id)
|
152
|
+
errors << "Alignment ID(s) on divs, sentences or tokens without alignment ID on source"
|
153
|
+
end
|
154
|
+
end
|
155
|
+
end
|
156
|
+
|
157
|
+
# Decide if there were any errors
|
148
158
|
if errors.empty?
|
149
159
|
true
|
150
160
|
else
|
data/lib/proiel/sentence.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2015 Marius L. Jøhndal
|
2
|
+
# Copyright (c) 2015-2016 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
@@ -23,8 +23,23 @@ module PROIEL
|
|
23
23
|
# @return [nil, String] presentation material after sentence
|
24
24
|
attr_reader :presentation_after
|
25
25
|
|
26
|
+
# @return [nil, Integer] ID of the sentence that this sentence is aligned to
|
27
|
+
attr_reader :alignment_id
|
28
|
+
|
29
|
+
# @return [nil, String] annotator of sentence
|
30
|
+
attr_reader :annotated_by
|
31
|
+
|
32
|
+
# @return [nil, String] reviewer of sentence
|
33
|
+
attr_reader :reviewed_by
|
34
|
+
|
35
|
+
# @return [nil, DateTime] time of annotation
|
36
|
+
attr_reader :annotated_at
|
37
|
+
|
38
|
+
# @return [nil, DateTime] time of review
|
39
|
+
attr_reader :reviewed_at
|
40
|
+
|
26
41
|
# Creates a new sentence object.
|
27
|
-
def initialize(parent, id, status, presentation_before, presentation_after, &block)
|
42
|
+
def initialize(parent, id, status, presentation_before, presentation_after, alignment_id, annotated_by, reviewed_by, annotated_at, reviewed_at, &block)
|
28
43
|
@div = parent
|
29
44
|
|
30
45
|
raise ArgumentError, 'integer expected' unless id.is_a?(Integer)
|
@@ -39,6 +54,21 @@ module PROIEL
|
|
39
54
|
raise ArgumentError, 'string or nil expected' unless presentation_after.nil? or presentation_after.is_a?(String)
|
40
55
|
@presentation_after = presentation_after.freeze
|
41
56
|
|
57
|
+
raise ArgumentError, 'integer or nil expected' unless alignment_id.nil? or alignment_id.is_a?(Integer)
|
58
|
+
@alignment_id = alignment_id
|
59
|
+
|
60
|
+
raise ArgumentError, 'XML schema date time or nil expected' unless annotated_at.nil? or PROIEL::Utilities.xmlschema_datetime?(annotated_at)
|
61
|
+
@annotated_at = annotated_at ? DateTime.xmlschema(annotated_at).freeze : nil
|
62
|
+
|
63
|
+
raise ArgumentError, 'XML schema date time or nil expected' unless reviewed_at.nil? or PROIEL::Utilities.xmlschema_datetime?(reviewed_at)
|
64
|
+
@reviewed_at = reviewed_at ? DateTime.xmlschema(reviewed_at).freeze : nil
|
65
|
+
|
66
|
+
raise ArgumentError, 'string or nil expected' unless annotated_by.nil? or annotated_by.is_a?(String)
|
67
|
+
@annotated_by = annotated_by.freeze
|
68
|
+
|
69
|
+
raise ArgumentError, 'string or nil expected' unless reviewed_by.nil? or reviewed_by.is_a?(String)
|
70
|
+
@reviewed_by = reviewed_by.freeze
|
71
|
+
|
42
72
|
@children = block.call(self) if block_given?
|
43
73
|
end
|
44
74
|
|
data/lib/proiel/source.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2015 Marius L. Jøhndal
|
2
|
+
# Copyright (c) 2015-2016 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
@@ -9,7 +9,7 @@ module PROIEL
|
|
9
9
|
# @return [String] ID of the source
|
10
10
|
attr_reader :id
|
11
11
|
|
12
|
-
# @return [Treebank] treebank that
|
12
|
+
# @return [Treebank] treebank that this source belongs to
|
13
13
|
attr_reader :treebank
|
14
14
|
|
15
15
|
# @return [String] language of the source as an ISO 639-3 language tag
|
@@ -22,13 +22,20 @@ module PROIEL
|
|
22
22
|
# @see PROIEL::Treebank::METADATA_ELEMENTS
|
23
23
|
attr_reader :metadata
|
24
24
|
|
25
|
+
# @return [nil, String] ID of the source that this source is aligned to
|
26
|
+
attr_reader :alignment_id
|
27
|
+
|
25
28
|
# Creates a new source object.
|
26
|
-
def initialize(parent, id, export_time, language, metadata, &block)
|
29
|
+
def initialize(parent, id, export_time, language, metadata, alignment_id, &block)
|
27
30
|
@treebank = parent
|
28
31
|
@id = id.freeze
|
29
32
|
@export_time = DateTime.parse(export_time).freeze
|
30
33
|
@language = language.freeze
|
31
34
|
@metadata = metadata.freeze
|
35
|
+
|
36
|
+
raise ArgumentError, 'string or nil expected' unless alignment_id.nil? or alignment_id.is_a?(String)
|
37
|
+
@alignment_id = alignment_id.freeze
|
38
|
+
|
32
39
|
@children = block.call(self) if block_given?
|
33
40
|
end
|
34
41
|
|
data/lib/proiel/token.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2015 Marius L. Jøhndal
|
2
|
+
# Copyright (c) 2015-2016 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
@@ -63,11 +63,15 @@ module PROIEL
|
|
63
63
|
# @return [Array<Array<String,Fixnum>>] secondary edges as an array of pairs of relation tag and target token ID
|
64
64
|
attr_reader :slashes
|
65
65
|
|
66
|
+
# @return [nil, Integer] ID of the token that this token is aligned to
|
67
|
+
attr_reader :alignment_id
|
68
|
+
|
66
69
|
# Creates a new token object.
|
67
70
|
def initialize(parent, id, head_id, form, lemma, part_of_speech,
|
68
71
|
morphology, relation, empty_token_sort, citation_part,
|
69
72
|
presentation_before, presentation_after, antecedent_id,
|
70
|
-
information_status, contrast_group, foreign_ids, slashes)
|
73
|
+
information_status, contrast_group, foreign_ids, slashes,
|
74
|
+
alignment_id)
|
71
75
|
@sentence = parent
|
72
76
|
|
73
77
|
raise ArgumentError, 'integer expected' unless id.is_a?(Integer)
|
@@ -117,6 +121,9 @@ module PROIEL
|
|
117
121
|
|
118
122
|
raise ArgumentError, 'array expected' unless slashes.is_a?(Array)
|
119
123
|
@slashes = slashes.map { |s| [s.relation.freeze, s.target_id] }
|
124
|
+
|
125
|
+
raise ArgumentError, 'integer or nil expected' unless alignment_id.nil? or alignment_id.is_a?(Integer)
|
126
|
+
@alignment_id = alignment_id
|
120
127
|
end
|
121
128
|
|
122
129
|
# @return [Div] parent div object
|
data/lib/proiel/treebank.rb
CHANGED
@@ -86,7 +86,7 @@ module PROIEL
|
|
86
86
|
|
87
87
|
tf.proiel.sources.each do |s|
|
88
88
|
@sources << Source.new(self, s.id, tf.proiel.export_time, s.language,
|
89
|
-
bundle_metadata(s)) do |source|
|
89
|
+
bundle_metadata(s), s.alignment_id) do |source|
|
90
90
|
build_divs(s, source)
|
91
91
|
end
|
92
92
|
|
@@ -165,10 +165,11 @@ module PROIEL
|
|
165
165
|
end
|
166
166
|
|
167
167
|
def build_divs(s, source)
|
168
|
-
#
|
168
|
+
# For PROIEL XML 2.0 we generate an ID, for PROIEL XML >= 2.1 we respect the ID
|
169
|
+
# from the XML file.
|
169
170
|
s.divs.each_with_index.map do |d, i|
|
170
|
-
Div.new(source, i + 1, d.title, d.presentation_before,
|
171
|
-
d.presentation_after) do |div|
|
171
|
+
Div.new(source, d.id || i + 1, d.title, d.presentation_before,
|
172
|
+
d.presentation_after, d.alignment_id) do |div|
|
172
173
|
build_sentences(d, div)
|
173
174
|
end
|
174
175
|
end
|
@@ -177,7 +178,9 @@ module PROIEL
|
|
177
178
|
def build_sentences(d, div)
|
178
179
|
d.sentences.map do |e|
|
179
180
|
Sentence.new(div, e.id, e.status, e.presentation_before,
|
180
|
-
e.presentation_after) do |sentence|
|
181
|
+
e.presentation_after, e.alignment_id,
|
182
|
+
e.annotated_by, e.reviewed_by, e.annotated_at,
|
183
|
+
e.reviewed_at) do |sentence|
|
181
184
|
build_tokens(e, sentence)
|
182
185
|
end
|
183
186
|
end
|
@@ -191,7 +194,7 @@ module PROIEL
|
|
191
194
|
t.presentation_before, t.presentation_after,
|
192
195
|
t.antecedent_id, t.information_status,
|
193
196
|
t.contrast_group, t.foreign_ids,
|
194
|
-
t.slashes)
|
197
|
+
t.slashes, t.alignment_id)
|
195
198
|
end
|
196
199
|
end
|
197
200
|
|
data/lib/proiel/utils.rb
ADDED
data/lib/proiel/version.rb
CHANGED
data/lib/proiel.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2015 Marius L. Jøhndal
|
2
|
+
# Copyright (c) 2015-2016 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
@@ -12,6 +12,7 @@ require 'memoist'
|
|
12
12
|
require 'nokogiri'
|
13
13
|
|
14
14
|
require 'proiel/version'
|
15
|
+
require 'proiel/utils'
|
15
16
|
require 'proiel/citations'
|
16
17
|
require 'proiel/statistics'
|
17
18
|
require 'proiel/tokenization'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: proiel
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Marius L. Jøhndal
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-06-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: json
|
@@ -167,15 +167,11 @@ files:
|
|
167
167
|
- lib/proiel/citations.rb
|
168
168
|
- lib/proiel/div.rb
|
169
169
|
- lib/proiel/positional_tag.rb
|
170
|
-
- lib/proiel/proiel_xml/proiel-1.0/proiel-1.0.md
|
171
170
|
- lib/proiel/proiel_xml/proiel-1.0/proiel-1.0.xsd
|
172
171
|
- lib/proiel/proiel_xml/proiel-1.0/teilite.xsd
|
173
172
|
- lib/proiel/proiel_xml/proiel-1.0/xml.xsd
|
174
|
-
- lib/proiel/proiel_xml/proiel-2.0/XMLSchema.xsd
|
175
|
-
- lib/proiel/proiel_xml/proiel-2.0/make
|
176
|
-
- lib/proiel/proiel_xml/proiel-2.0/proiel-2.0.html
|
177
173
|
- lib/proiel/proiel_xml/proiel-2.0/proiel-2.0.xsd
|
178
|
-
- lib/proiel/proiel_xml/proiel-2.0/xs3p.xsl
|
174
|
+
- lib/proiel/proiel_xml/proiel-2.1/proiel-2.1.xsd
|
179
175
|
- lib/proiel/proiel_xml/reader.rb
|
180
176
|
- lib/proiel/proiel_xml/schema.rb
|
181
177
|
- lib/proiel/proiel_xml/validator.rb
|
@@ -186,6 +182,7 @@ files:
|
|
186
182
|
- lib/proiel/tokenization.rb
|
187
183
|
- lib/proiel/treebank.rb
|
188
184
|
- lib/proiel/treebank_object.rb
|
185
|
+
- lib/proiel/utils.rb
|
189
186
|
- lib/proiel/version.rb
|
190
187
|
homepage: http://proiel.github.com
|
191
188
|
licenses:
|
@@ -1,16 +0,0 @@
|
|
1
|
-
# The PROIEL XML format
|
2
|
-
|
3
|
-
## `token`
|
4
|
-
|
5
|
-
### `lemma` attribute (string, optional)
|
6
|
-
|
7
|
-
When it is necessary to distinguish lemmas with the same textual form, the
|
8
|
-
PROIEL XML convention is to use the associated part of speech to distinguish them.
|
9
|
-
|
10
|
-
If there are multiple lemmas with the same textual form and the same part of
|
11
|
-
speech, the convention is to append `#` and a positive, non-zero integer:
|
12
|
-
|
13
|
-
```
|
14
|
-
quod#1
|
15
|
-
quod#2
|
16
|
-
```
|