proiel-cli 0.1.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/proiel +1 -1
- data/lib/proiel/cli/commands/convert.rb +3 -0
- data/lib/proiel/cli/commands/tokenize.rb +11 -10
- data/lib/proiel/cli/commands/words.rb +36 -0
- data/lib/proiel/cli/converters/conll-u.rb +5 -2
- data/lib/proiel/cli/converters/proielxml.rb +20 -3
- data/lib/proiel/cli/version.rb +1 -1
- metadata +6 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cf5056d98706541003c897f25b470145a26dab43
|
4
|
+
data.tar.gz: 3bc9d72f373deec98554ec4bab9d0482f427a6b6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6f4a512e9c4f35ccfdd84c4ec1c7d2bed4c5089862802c6171247817b64bb00ba9a3dadf2e82b7ba12bee2b647b86d0faf705e402e34ba6f74c0bdc3a8547796
|
7
|
+
data.tar.gz: c14b7972c896f8f47dbf764b69796c011c9827b715fbf324b2bcca58f82d30acad7f0906d084e3e654299bb5cc22a77547c7fa6a7ef0e6d1c135227cf69701d9
|
data/bin/proiel
CHANGED
@@ -6,7 +6,7 @@ $:.unshift File.join(File.dirname(__FILE__), *%w{ .. lib })
|
|
6
6
|
require 'proiel/cli'
|
7
7
|
|
8
8
|
Mercenary.program(:proiel) do |p|
|
9
|
-
p.version PROIEL::VERSION
|
9
|
+
p.version PROIEL::CLI::VERSION
|
10
10
|
p.description 'proiel is a command-line interface for PROIEL treebanks'
|
11
11
|
p.syntax 'proiel <subcommand> [options]'
|
12
12
|
|
data/lib/proiel/cli/commands/convert.rb
CHANGED
@@ -16,6 +16,9 @@ module PROIEL
|
|
16
16
|
f.option 'remove-syntax', '--remove-syntax', 'Remove syntactic annotation (relation, head ID and slashes)'
|
17
17
|
f.option 'remove-information-structure', '--remove-information-structure', 'Remove informtion structure annotation (antecedent ID, information status and contrast group)'
|
18
18
|
f.option 'remove-status', '--remove-status', 'Remove sentence status (i.e. revert all sentences to unannotated status)'
|
19
|
+
f.option 'remove-alignments', '--remove-alignments', 'Remove alignments'
|
20
|
+
f.option 'remove-annotator', '--remove-annotator', 'Remove annotator information'
|
21
|
+
f.option 'remove-reviewer', '--remove-reviewer', 'Remove reviewer information'
|
19
22
|
f.option 'remove-empty-divs', '--remove-empty-divs', 'Remove div elements that do not contain any sentences'
|
20
23
|
f.action { |args, options| process(args, options, PROIEL::Converter::PROIELXML) }
|
21
24
|
end
|
data/lib/proiel/cli/commands/tokenize.rb
CHANGED
@@ -48,8 +48,9 @@ module PROIEL
|
|
48
48
|
citation_part = nil
|
49
49
|
|
50
50
|
body.each_with_index do |sd_body, i|
|
51
|
-
builder.div
|
52
|
-
sd_body[:
|
51
|
+
builder.div do
|
52
|
+
builder.title sd_body[:title]
|
53
|
+
sd_body[:contents].split(/(@[^ ]+|§[^ ]+ )/).map do |s|
|
53
54
|
if s[0] == '§' or s[0] == '@'
|
54
55
|
s
|
55
56
|
else
|
@@ -57,16 +58,16 @@ module PROIEL
|
|
57
58
|
# sentence-breaking punctuation like periods and question marks, but
|
58
59
|
# after the punctuation mark and characters typically used in pairs,
|
59
60
|
# like brackets and apostrophes.
|
60
|
-
s.gsub(/([\.:;\?!]+[\s†\]\)"']*)/, '\1|')
|
61
|
+
s.gsub(/([\.:;\?!]+[\s†\]\)"']*|\s*[\n\r]+)/, '\1|')
|
61
62
|
end
|
62
63
|
end.join.split('|').each_with_index do |s_body, j|
|
63
|
-
builder.sentence(
|
64
|
+
builder.sentence(status: 'unannotated') do
|
64
65
|
leftover_before = ''
|
65
66
|
|
66
67
|
# Preserve linebreaks in the text.
|
67
|
-
s_body.gsub!(/\s*[\n\r]
|
68
|
+
s_body.gsub!(/\s*[\n\r]+/, "\u2028")
|
68
69
|
|
69
|
-
s_body.scan(/([^@§\p{Word}]*)([\p{Word}]+|@[^ ]+|§[^ ]+)([^@§\p{Word}]*)/).each do |(before, form, after)|
|
70
|
+
s_body.scan(/([^@§\p{Word}]*)([\p{Word}]+|@[^ ]+|§[^ ]+ )([^@§\p{Word}]*)/).each do |(before, form, after)|
|
70
71
|
case form
|
71
72
|
when /^@(.*)$/
|
72
73
|
leftover_before += before unless before.nil?
|
@@ -74,15 +75,15 @@ module PROIEL
|
|
74
75
|
leftover_before += after unless after.nil?
|
75
76
|
when /^§(.*)$/
|
76
77
|
leftover_before += before unless before.nil?
|
77
|
-
citation_part = $1
|
78
|
+
citation_part = $1.strip
|
78
79
|
leftover_before += after unless after.nil?
|
79
80
|
else
|
80
81
|
before = leftover_before + before
|
81
82
|
leftover_before = ''
|
82
83
|
|
83
|
-
attrs = {
|
84
|
-
attrs[:
|
85
|
-
attrs[:
|
84
|
+
attrs = { :"citation-part" => citation_part, form: form }
|
85
|
+
attrs[:"presentation-before"] = before unless before == ''
|
86
|
+
attrs[:"presentation-after"] = after unless after == ''
|
86
87
|
|
87
88
|
builder.token(attrs)
|
88
89
|
end
|
data/lib/proiel/cli/commands/words.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
module PROIEL
|
2
|
+
module Commands
|
3
|
+
class Words < Command
|
4
|
+
class << self
|
5
|
+
def init_with_program(prog)
|
6
|
+
prog.command(:words) do |c|
|
7
|
+
c.syntax 'words [options] filename(s)'
|
8
|
+
c.description 'Extract a word list'
|
9
|
+
|
10
|
+
c.action do |args, options|
|
11
|
+
if args.empty?
|
12
|
+
STDERR.puts 'Missing filename(s). Use --help for more information.'
|
13
|
+
else
|
14
|
+
process(args, options)
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
def process(args, options)
|
21
|
+
tb = PROIEL::Treebank.new
|
22
|
+
|
23
|
+
args.each do |filename|
|
24
|
+
STDERR.puts "Reading #{filename}...".green if options['verbose']
|
25
|
+
|
26
|
+
tb.load_from_xml(filename)
|
27
|
+
end
|
28
|
+
|
29
|
+
tb.sources.map { |s| s.tokens.map(&:form) }.flatten.sort.uniq.each do |form|
|
30
|
+
STDOUT.puts form
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
data/lib/proiel/cli/converters/conll-u.rb
CHANGED
@@ -57,6 +57,7 @@ module PROIEL
|
|
57
57
|
t.relation,
|
58
58
|
t.empty_token_sort,
|
59
59
|
t.slashes.map { |relation, target_id| [id_to_number[target_id], relation] },
|
60
|
+
t.citation_part,
|
60
61
|
self
|
61
62
|
)
|
62
63
|
end
|
@@ -164,8 +165,9 @@ module PROIEL
|
|
164
165
|
attr_reader :language
|
165
166
|
attr_reader :empty_token_sort
|
166
167
|
attr_reader :form
|
168
|
+
attr_reader :citation_part
|
167
169
|
|
168
|
-
def initialize(id, head_id, form, lemma, part_of_speech, language, morphology, relation, empty_token_sort, slashes, sentence)
|
170
|
+
def initialize(id, head_id, form, lemma, part_of_speech, language, morphology, relation, empty_token_sort, slashes, citation_part, sentence)
|
169
171
|
@id = id
|
170
172
|
@head_id = head_id
|
171
173
|
@form = form
|
@@ -178,6 +180,7 @@ module PROIEL
|
|
178
180
|
@slashes = slashes
|
179
181
|
@sentence = sentence
|
180
182
|
@features = (morphology ? map_morphology(morphology) : '' )
|
183
|
+
@citation_part = "ref=" + (citation_part ? citation_part : "").gsub(/\s/, '_')
|
181
184
|
@upos = nil
|
182
185
|
end
|
183
186
|
|
@@ -351,7 +354,7 @@ module PROIEL
|
|
351
354
|
@head_id,
|
352
355
|
(@head_id == 0 ? 'root' : @relation), # override non-root relations on root until we've found out how to handle unembedded reports etc
|
353
356
|
'_', # slashes here
|
354
|
-
|
357
|
+
@citation_part].join("\t")
|
355
358
|
end
|
356
359
|
|
357
360
|
def to_s
|
data/lib/proiel/cli/converters/proielxml.rb
CHANGED
@@ -5,7 +5,7 @@ module PROIEL
|
|
5
5
|
def process(tb, options)
|
6
6
|
builder = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
|
7
7
|
builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'
|
8
|
-
builder.proiel('export-time' => DateTime.now.xmlschema, 'schema-version' => '2.
|
8
|
+
builder.proiel('export-time' => DateTime.now.xmlschema, 'schema-version' => '2.1') do
|
9
9
|
builder.annotation do
|
10
10
|
builder.relations do
|
11
11
|
tb.annotation_schema.relation_tags.each do |tag, value|
|
@@ -45,14 +45,24 @@ module PROIEL
|
|
45
45
|
end
|
46
46
|
|
47
47
|
tb.sources.each do |source|
|
48
|
-
|
48
|
+
mandatory_features = %i(id language)
|
49
|
+
optional_features = []
|
50
|
+
optional_features += %i(alignment_id) unless options['remove-alignments']
|
51
|
+
|
52
|
+
builder.source(grab_features(source, mandatory_features, optional_features)) do
|
49
53
|
PROIEL::Treebank::METADATA_ELEMENTS.each do |field|
|
50
54
|
builder.tag!(field.to_s.gsub('_', '-'), source.send(field)) if source.send(field)
|
51
55
|
end
|
52
56
|
|
53
57
|
source.divs.each do |div|
|
54
58
|
if include_div?(div, options)
|
55
|
-
|
59
|
+
mandatory_features = %i()
|
60
|
+
|
61
|
+
optional_features = []
|
62
|
+
optional_features += %i(presentation_before presentation_after)
|
63
|
+
optional_features += %i(alignment_id) unless options['remove-alignments']
|
64
|
+
|
65
|
+
builder.div(grab_features(div, mandatory_features, optional_features)) do
|
56
66
|
builder.title div.title if div.title
|
57
67
|
|
58
68
|
div.sentences.each do |sentence|
|
@@ -62,6 +72,11 @@ module PROIEL
|
|
62
72
|
optional_features = [] # we do it this way to preserve the order of status and presentation_* so that diffing files is easier
|
63
73
|
optional_features += %i(status) unless options['remove-status']
|
64
74
|
optional_features += %i(presentation_before presentation_after)
|
75
|
+
optional_features += %i(alignment_id) unless options['remove-alignments']
|
76
|
+
optional_features += %i(annotated_at) unless options['remove-annotator']
|
77
|
+
optional_features += %i(reviewed_at) unless options['remove-reviewer']
|
78
|
+
optional_features += %i(annotated_by) unless options['remove-annotator']
|
79
|
+
optional_features += %i(reviewed_by) unless options['remove-reviewer']
|
65
80
|
|
66
81
|
builder.sentence(grab_features(sentence, mandatory_features, optional_features)) do
|
67
82
|
sentence.tokens.each do |token|
|
@@ -83,6 +98,8 @@ module PROIEL
|
|
83
98
|
mandatory_features << :empty_token_sort
|
84
99
|
end
|
85
100
|
|
101
|
+
optional_features += %i(alignment_id) unless options['remove-alignments']
|
102
|
+
|
86
103
|
attrs = grab_features(token, mandatory_features, optional_features)
|
87
104
|
|
88
105
|
unless token.slashes.empty? or options['remove-syntax'] # this extra test avoids <token></token> style XML
|
data/lib/proiel/cli/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: proiel-cli
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Marius L. Jøhndal
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2016-06-04 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: builder
|
@@ -59,14 +59,14 @@ dependencies:
|
|
59
59
|
requirements:
|
60
60
|
- - "~>"
|
61
61
|
- !ruby/object:Gem::Version
|
62
|
-
version: '1.
|
62
|
+
version: '1.1'
|
63
63
|
type: :runtime
|
64
64
|
prerelease: false
|
65
65
|
version_requirements: !ruby/object:Gem::Requirement
|
66
66
|
requirements:
|
67
67
|
- - "~>"
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version: '1.
|
69
|
+
version: '1.1'
|
70
70
|
- !ruby/object:Gem::Dependency
|
71
71
|
name: bundler
|
72
72
|
requirement: !ruby/object:Gem::Requirement
|
@@ -204,6 +204,7 @@ files:
|
|
204
204
|
- lib/proiel/cli/commands/info.rb
|
205
205
|
- lib/proiel/cli/commands/tokenize.rb
|
206
206
|
- lib/proiel/cli/commands/validate.rb
|
207
|
+
- lib/proiel/cli/commands/words.rb
|
207
208
|
- lib/proiel/cli/converters/conll-u.rb
|
208
209
|
- lib/proiel/cli/converters/conll-u/morphology.rb
|
209
210
|
- lib/proiel/cli/converters/conll-u/syntax.rb
|
@@ -235,7 +236,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
235
236
|
version: '0'
|
236
237
|
requirements: []
|
237
238
|
rubyforge_project:
|
238
|
-
rubygems_version: 2.
|
239
|
+
rubygems_version: 2.5.1
|
239
240
|
signing_key:
|
240
241
|
specification_version: 4
|
241
242
|
summary: A command-line interface for working with PROIEL treebanks
|