proiel-cli 0.1.1 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/proiel +1 -1
- data/lib/proiel/cli/commands/convert.rb +3 -0
- data/lib/proiel/cli/commands/tokenize.rb +11 -10
- data/lib/proiel/cli/commands/words.rb +36 -0
- data/lib/proiel/cli/converters/conll-u.rb +5 -2
- data/lib/proiel/cli/converters/proielxml.rb +20 -3
- data/lib/proiel/cli/version.rb +1 -1
- metadata +6 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cf5056d98706541003c897f25b470145a26dab43
|
4
|
+
data.tar.gz: 3bc9d72f373deec98554ec4bab9d0482f427a6b6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6f4a512e9c4f35ccfdd84c4ec1c7d2bed4c5089862802c6171247817b64bb00ba9a3dadf2e82b7ba12bee2b647b86d0faf705e402e34ba6f74c0bdc3a8547796
|
7
|
+
data.tar.gz: c14b7972c896f8f47dbf764b69796c011c9827b715fbf324b2bcca58f82d30acad7f0906d084e3e654299bb5cc22a77547c7fa6a7ef0e6d1c135227cf69701d9
|
data/bin/proiel
CHANGED
@@ -6,7 +6,7 @@ $:.unshift File.join(File.dirname(__FILE__), *%w{ .. lib })
|
|
6
6
|
require 'proiel/cli'
|
7
7
|
|
8
8
|
Mercenary.program(:proiel) do |p|
|
9
|
-
p.version PROIEL::VERSION
|
9
|
+
p.version PROIEL::CLI::VERSION
|
10
10
|
p.description 'proiel is a command-line interface for PROIEL treebanks'
|
11
11
|
p.syntax 'proiel <subcommand> [options]'
|
12
12
|
|
@@ -16,6 +16,9 @@ module PROIEL
|
|
16
16
|
f.option 'remove-syntax', '--remove-syntax', 'Remove syntactic annotation (relation, head ID and slashes)'
|
17
17
|
f.option 'remove-information-structure', '--remove-information-structure', 'Remove informtion structure annotation (antecedent ID, information status and contrast group)'
|
18
18
|
f.option 'remove-status', '--remove-status', 'Remove sentence status (i.e. revert all sentences to unannotated status)'
|
19
|
+
f.option 'remove-alignments', '--remove-alignments', 'Remove alignments'
|
20
|
+
f.option 'remove-annotator', '--remove-annotator', 'Remove annotator information'
|
21
|
+
f.option 'remove-reviewer', '--remove-reviewer', 'Remove reviewer information'
|
19
22
|
f.option 'remove-empty-divs', '--remove-empty-divs', 'Remove div elements that do not contain any sentences'
|
20
23
|
f.action { |args, options| process(args, options, PROIEL::Converter::PROIELXML) }
|
21
24
|
end
|
@@ -48,8 +48,9 @@ module PROIEL
|
|
48
48
|
citation_part = nil
|
49
49
|
|
50
50
|
body.each_with_index do |sd_body, i|
|
51
|
-
builder.div
|
52
|
-
sd_body[:
|
51
|
+
builder.div do
|
52
|
+
builder.title sd_body[:title]
|
53
|
+
sd_body[:contents].split(/(@[^ ]+|§[^ ]+ )/).map do |s|
|
53
54
|
if s[0] == '§' or s[0] == '@'
|
54
55
|
s
|
55
56
|
else
|
@@ -57,16 +58,16 @@ module PROIEL
|
|
57
58
|
# sentence-breaking punctuation like periods and question marks, but
|
58
59
|
# after the punctuation mark and characters typically used in pairs,
|
59
60
|
# like brackets and apostrophes.
|
60
|
-
s.gsub(/([\.:;\?!]+[\s†\]\)"']*)/, '\1|')
|
61
|
+
s.gsub(/([\.:;\?!]+[\s†\]\)"']*|\s*[\n\r]+)/, '\1|')
|
61
62
|
end
|
62
63
|
end.join.split('|').each_with_index do |s_body, j|
|
63
|
-
builder.sentence(
|
64
|
+
builder.sentence(status: 'unannotated') do
|
64
65
|
leftover_before = ''
|
65
66
|
|
66
67
|
# Preserve linebreaks in the text.
|
67
|
-
s_body.gsub!(/\s*[\n\r]
|
68
|
+
s_body.gsub!(/\s*[\n\r]+/, "\u2028")
|
68
69
|
|
69
|
-
s_body.scan(/([^@§\p{Word}]*)([\p{Word}]+|@[^ ]+|§[^ ]+)([^@§\p{Word}]*)/).each do |(before, form, after)|
|
70
|
+
s_body.scan(/([^@§\p{Word}]*)([\p{Word}]+|@[^ ]+|§[^ ]+ )([^@§\p{Word}]*)/).each do |(before, form, after)|
|
70
71
|
case form
|
71
72
|
when /^@(.*)$/
|
72
73
|
leftover_before += before unless before.nil?
|
@@ -74,15 +75,15 @@ module PROIEL
|
|
74
75
|
leftover_before += after unless after.nil?
|
75
76
|
when /^§(.*)$/
|
76
77
|
leftover_before += before unless before.nil?
|
77
|
-
citation_part = $1
|
78
|
+
citation_part = $1.strip
|
78
79
|
leftover_before += after unless after.nil?
|
79
80
|
else
|
80
81
|
before = leftover_before + before
|
81
82
|
leftover_before = ''
|
82
83
|
|
83
|
-
attrs = {
|
84
|
-
attrs[:
|
85
|
-
attrs[:
|
84
|
+
attrs = { :"citation-part" => citation_part, form: form }
|
85
|
+
attrs[:"presentation-before"] = before unless before == ''
|
86
|
+
attrs[:"presentation-after"] = after unless after == ''
|
86
87
|
|
87
88
|
builder.token(attrs)
|
88
89
|
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module PROIEL
|
2
|
+
module Commands
|
3
|
+
class Words < Command
|
4
|
+
class << self
|
5
|
+
def init_with_program(prog)
|
6
|
+
prog.command(:words) do |c|
|
7
|
+
c.syntax 'words [options] filename(s)'
|
8
|
+
c.description 'Extract a word list'
|
9
|
+
|
10
|
+
c.action do |args, options|
|
11
|
+
if args.empty?
|
12
|
+
STDERR.puts 'Missing filename(s). Use --help for more information.'
|
13
|
+
else
|
14
|
+
process(args, options)
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
def process(args, options)
|
21
|
+
tb = PROIEL::Treebank.new
|
22
|
+
|
23
|
+
args.each do |filename|
|
24
|
+
STDERR.puts "Reading #{filename}...".green if options['verbose']
|
25
|
+
|
26
|
+
tb.load_from_xml(filename)
|
27
|
+
end
|
28
|
+
|
29
|
+
tb.sources.map { |s| s.tokens.map(&:form) }.flatten.sort.uniq.each do |form|
|
30
|
+
STDOUT.puts form
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
@@ -57,6 +57,7 @@ module PROIEL
|
|
57
57
|
t.relation,
|
58
58
|
t.empty_token_sort,
|
59
59
|
t.slashes.map { |relation, target_id| [id_to_number[target_id], relation] },
|
60
|
+
t.citation_part,
|
60
61
|
self
|
61
62
|
)
|
62
63
|
end
|
@@ -164,8 +165,9 @@ module PROIEL
|
|
164
165
|
attr_reader :language
|
165
166
|
attr_reader :empty_token_sort
|
166
167
|
attr_reader :form
|
168
|
+
attr_reader :citation_part
|
167
169
|
|
168
|
-
def initialize(id, head_id, form, lemma, part_of_speech, language, morphology, relation, empty_token_sort, slashes, sentence)
|
170
|
+
def initialize(id, head_id, form, lemma, part_of_speech, language, morphology, relation, empty_token_sort, slashes, citation_part, sentence)
|
169
171
|
@id = id
|
170
172
|
@head_id = head_id
|
171
173
|
@form = form
|
@@ -178,6 +180,7 @@ module PROIEL
|
|
178
180
|
@slashes = slashes
|
179
181
|
@sentence = sentence
|
180
182
|
@features = (morphology ? map_morphology(morphology) : '' )
|
183
|
+
@citation_part = "ref=" + (citation_part ? citation_part : "").gsub(/\s/, '_')
|
181
184
|
@upos = nil
|
182
185
|
end
|
183
186
|
|
@@ -351,7 +354,7 @@ module PROIEL
|
|
351
354
|
@head_id,
|
352
355
|
(@head_id == 0 ? 'root' : @relation), # override non-root relations on root until we've found out how to handle unembedded reports etc
|
353
356
|
'_', # slashes here
|
354
|
-
|
357
|
+
@citation_part].join("\t")
|
355
358
|
end
|
356
359
|
|
357
360
|
def to_s
|
@@ -5,7 +5,7 @@ module PROIEL
|
|
5
5
|
def process(tb, options)
|
6
6
|
builder = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
|
7
7
|
builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'
|
8
|
-
builder.proiel('export-time' => DateTime.now.xmlschema, 'schema-version' => '2.
|
8
|
+
builder.proiel('export-time' => DateTime.now.xmlschema, 'schema-version' => '2.1') do
|
9
9
|
builder.annotation do
|
10
10
|
builder.relations do
|
11
11
|
tb.annotation_schema.relation_tags.each do |tag, value|
|
@@ -45,14 +45,24 @@ module PROIEL
|
|
45
45
|
end
|
46
46
|
|
47
47
|
tb.sources.each do |source|
|
48
|
-
|
48
|
+
mandatory_features = %i(id language)
|
49
|
+
optional_features = []
|
50
|
+
optional_features += %i(alignment_id) unless options['remove-alignments']
|
51
|
+
|
52
|
+
builder.source(grab_features(source, mandatory_features, optional_features)) do
|
49
53
|
PROIEL::Treebank::METADATA_ELEMENTS.each do |field|
|
50
54
|
builder.tag!(field.to_s.gsub('_', '-'), source.send(field)) if source.send(field)
|
51
55
|
end
|
52
56
|
|
53
57
|
source.divs.each do |div|
|
54
58
|
if include_div?(div, options)
|
55
|
-
|
59
|
+
mandatory_features = %i()
|
60
|
+
|
61
|
+
optional_features = []
|
62
|
+
optional_features += %i(presentation_before presentation_after)
|
63
|
+
optional_features += %i(alignment_id) unless options['remove-alignments']
|
64
|
+
|
65
|
+
builder.div(grab_features(div, mandatory_features, optional_features)) do
|
56
66
|
builder.title div.title if div.title
|
57
67
|
|
58
68
|
div.sentences.each do |sentence|
|
@@ -62,6 +72,11 @@ module PROIEL
|
|
62
72
|
optional_features = [] # we do it this way to preserve the order of status and presentation_* so that diffing files is easier
|
63
73
|
optional_features += %i(status) unless options['remove-status']
|
64
74
|
optional_features += %i(presentation_before presentation_after)
|
75
|
+
optional_features += %i(alignment_id) unless options['remove-alignments']
|
76
|
+
optional_features += %i(annotated_at) unless options['remove-annotator']
|
77
|
+
optional_features += %i(reviewed_at) unless options['remove-reviewer']
|
78
|
+
optional_features += %i(annotated_by) unless options['remove-annotator']
|
79
|
+
optional_features += %i(reviewed_by) unless options['remove-reviewer']
|
65
80
|
|
66
81
|
builder.sentence(grab_features(sentence, mandatory_features, optional_features)) do
|
67
82
|
sentence.tokens.each do |token|
|
@@ -83,6 +98,8 @@ module PROIEL
|
|
83
98
|
mandatory_features << :empty_token_sort
|
84
99
|
end
|
85
100
|
|
101
|
+
optional_features += %i(alignment_id) unless options['remove-alignments']
|
102
|
+
|
86
103
|
attrs = grab_features(token, mandatory_features, optional_features)
|
87
104
|
|
88
105
|
unless token.slashes.empty? or options['remove-syntax'] # this extra test avoids <token></token> style XML
|
data/lib/proiel/cli/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: proiel-cli
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Marius L. Jøhndal
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2016-06-04 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: builder
|
@@ -59,14 +59,14 @@ dependencies:
|
|
59
59
|
requirements:
|
60
60
|
- - "~>"
|
61
61
|
- !ruby/object:Gem::Version
|
62
|
-
version: '1.
|
62
|
+
version: '1.1'
|
63
63
|
type: :runtime
|
64
64
|
prerelease: false
|
65
65
|
version_requirements: !ruby/object:Gem::Requirement
|
66
66
|
requirements:
|
67
67
|
- - "~>"
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version: '1.
|
69
|
+
version: '1.1'
|
70
70
|
- !ruby/object:Gem::Dependency
|
71
71
|
name: bundler
|
72
72
|
requirement: !ruby/object:Gem::Requirement
|
@@ -204,6 +204,7 @@ files:
|
|
204
204
|
- lib/proiel/cli/commands/info.rb
|
205
205
|
- lib/proiel/cli/commands/tokenize.rb
|
206
206
|
- lib/proiel/cli/commands/validate.rb
|
207
|
+
- lib/proiel/cli/commands/words.rb
|
207
208
|
- lib/proiel/cli/converters/conll-u.rb
|
208
209
|
- lib/proiel/cli/converters/conll-u/morphology.rb
|
209
210
|
- lib/proiel/cli/converters/conll-u/syntax.rb
|
@@ -235,7 +236,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
235
236
|
version: '0'
|
236
237
|
requirements: []
|
237
238
|
rubyforge_project:
|
238
|
-
rubygems_version: 2.
|
239
|
+
rubygems_version: 2.5.1
|
239
240
|
signing_key:
|
240
241
|
specification_version: 4
|
241
242
|
summary: A command-line interface for working with PROIEL treebanks
|