proiel-cli 0.1.1 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: bfb4db333ae791d4171490c7b3d316364b205729
4
- data.tar.gz: fd001265bdcc8e75e49fc1e2a188f7c63471bb62
3
+ metadata.gz: cf5056d98706541003c897f25b470145a26dab43
4
+ data.tar.gz: 3bc9d72f373deec98554ec4bab9d0482f427a6b6
5
5
  SHA512:
6
- metadata.gz: 29cb26e28f22486db097b982d7e2cd4783f2b38d8d6c13fb06309fb25377791cd8248c88627b7d5b72c3adfe0e958c9696b2508fcb975e9a8fdf516a9825963b
7
- data.tar.gz: b565292467456b10415039f0464bdfafcae5fecf55747dd3b9ec2473313ba08789d49431ecb06d649596bfd98031640481efccd9b5be58b6b565d13dcc3a1161
6
+ metadata.gz: 6f4a512e9c4f35ccfdd84c4ec1c7d2bed4c5089862802c6171247817b64bb00ba9a3dadf2e82b7ba12bee2b647b86d0faf705e402e34ba6f74c0bdc3a8547796
7
+ data.tar.gz: c14b7972c896f8f47dbf764b69796c011c9827b715fbf324b2bcca58f82d30acad7f0906d084e3e654299bb5cc22a77547c7fa6a7ef0e6d1c135227cf69701d9
data/bin/proiel CHANGED
@@ -6,7 +6,7 @@ $:.unshift File.join(File.dirname(__FILE__), *%w{ .. lib })
6
6
  require 'proiel/cli'
7
7
 
8
8
  Mercenary.program(:proiel) do |p|
9
- p.version PROIEL::VERSION
9
+ p.version PROIEL::CLI::VERSION
10
10
  p.description 'proiel is a command-line interface for PROIEL treebanks'
11
11
  p.syntax 'proiel <subcommand> [options]'
12
12
 
@@ -16,6 +16,9 @@ module PROIEL
16
16
  f.option 'remove-syntax', '--remove-syntax', 'Remove syntactic annotation (relation, head ID and slashes)'
17
17
  f.option 'remove-information-structure', '--remove-information-structure', 'Remove informtion structure annotation (antecedent ID, information status and contrast group)'
18
18
  f.option 'remove-status', '--remove-status', 'Remove sentence status (i.e. revert all sentences to unannotated status)'
19
+ f.option 'remove-alignments', '--remove-alignments', 'Remove alignments'
20
+ f.option 'remove-annotator', '--remove-annotator', 'Remove annotator information'
21
+ f.option 'remove-reviewer', '--remove-reviewer', 'Remove reviewer information'
19
22
  f.option 'remove-empty-divs', '--remove-empty-divs', 'Remove div elements that do not contain any sentences'
20
23
  f.action { |args, options| process(args, options, PROIEL::Converter::PROIELXML) }
21
24
  end
@@ -48,8 +48,9 @@ module PROIEL
48
48
  citation_part = nil
49
49
 
50
50
  body.each_with_index do |sd_body, i|
51
- builder.div(title: sd_body[:title]) do
52
- sd_body[:contents].split(/(@[^ ]+|§[^ ]+)/).map do |s|
51
+ builder.div do
52
+ builder.title sd_body[:title]
53
+ sd_body[:contents].split(/(@[^ ]+|§[^ ]+ )/).map do |s|
53
54
  if s[0] == '§' or s[0] == '@'
54
55
  s
55
56
  else
@@ -57,16 +58,16 @@ module PROIEL
57
58
  # sentence-breaking punctuation like periods and question marks, but
58
59
  # after the punctuation mark and characters typically used in pairs,
59
60
  # like brackets and apostrophes.
60
- s.gsub(/([\.:;\?!]+[\s†\]\)"']*)/, '\1|')
61
+ s.gsub(/([\.:;\?!]+[\s†\]\)"']*|\s*[\n\r]+)/, '\1|')
61
62
  end
62
63
  end.join.split('|').each_with_index do |s_body, j|
63
- builder.sentence(status_tag: 'unannotated') do
64
+ builder.sentence(status: 'unannotated') do
64
65
  leftover_before = ''
65
66
 
66
67
  # Preserve linebreaks in the text.
67
- s_body.gsub!(/\s*[\n\r]/, "\u2028")
68
+ s_body.gsub!(/\s*[\n\r]+/, "\u2028")
68
69
 
69
- s_body.scan(/([^@§\p{Word}]*)([\p{Word}]+|@[^ ]+|§[^ ]+)([^@§\p{Word}]*)/).each do |(before, form, after)|
70
+ s_body.scan(/([^@§\p{Word}]*)([\p{Word}]+|@[^ ]+|§[^ ]+ )([^@§\p{Word}]*)/).each do |(before, form, after)|
70
71
  case form
71
72
  when /^@(.*)$/
72
73
  leftover_before += before unless before.nil?
@@ -74,15 +75,15 @@ module PROIEL
74
75
  leftover_before += after unless after.nil?
75
76
  when /^§(.*)$/
76
77
  leftover_before += before unless before.nil?
77
- citation_part = $1
78
+ citation_part = $1.strip
78
79
  leftover_before += after unless after.nil?
79
80
  else
80
81
  before = leftover_before + before
81
82
  leftover_before = ''
82
83
 
83
- attrs = { citation_part: citation_part, form: form }
84
- attrs[:presentation_before] = before unless before == ''
85
- attrs[:presentation_after] = after unless after == ''
84
+ attrs = { :"citation-part" => citation_part, form: form }
85
+ attrs[:"presentation-before"] = before unless before == ''
86
+ attrs[:"presentation-after"] = after unless after == ''
86
87
 
87
88
  builder.token(attrs)
88
89
  end
@@ -0,0 +1,36 @@
1
+ module PROIEL
2
+ module Commands
3
+ class Words < Command
4
+ class << self
5
+ def init_with_program(prog)
6
+ prog.command(:words) do |c|
7
+ c.syntax 'words [options] filename(s)'
8
+ c.description 'Extract a word list'
9
+
10
+ c.action do |args, options|
11
+ if args.empty?
12
+ STDERR.puts 'Missing filename(s). Use --help for more information.'
13
+ else
14
+ process(args, options)
15
+ end
16
+ end
17
+ end
18
+ end
19
+
20
+ def process(args, options)
21
+ tb = PROIEL::Treebank.new
22
+
23
+ args.each do |filename|
24
+ STDERR.puts "Reading #{filename}...".green if options['verbose']
25
+
26
+ tb.load_from_xml(filename)
27
+ end
28
+
29
+ tb.sources.map { |s| s.tokens.map(&:form) }.flatten.sort.uniq.each do |form|
30
+ STDOUT.puts form
31
+ end
32
+ end
33
+ end
34
+ end
35
+ end
36
+ end
@@ -57,6 +57,7 @@ module PROIEL
57
57
  t.relation,
58
58
  t.empty_token_sort,
59
59
  t.slashes.map { |relation, target_id| [id_to_number[target_id], relation] },
60
+ t.citation_part,
60
61
  self
61
62
  )
62
63
  end
@@ -164,8 +165,9 @@ module PROIEL
164
165
  attr_reader :language
165
166
  attr_reader :empty_token_sort
166
167
  attr_reader :form
168
+ attr_reader :citation_part
167
169
 
168
- def initialize(id, head_id, form, lemma, part_of_speech, language, morphology, relation, empty_token_sort, slashes, sentence)
170
+ def initialize(id, head_id, form, lemma, part_of_speech, language, morphology, relation, empty_token_sort, slashes, citation_part, sentence)
169
171
  @id = id
170
172
  @head_id = head_id
171
173
  @form = form
@@ -178,6 +180,7 @@ module PROIEL
178
180
  @slashes = slashes
179
181
  @sentence = sentence
180
182
  @features = (morphology ? map_morphology(morphology) : '' )
183
+ @citation_part = "ref=" + (citation_part ? citation_part : "").gsub(/\s/, '_')
181
184
  @upos = nil
182
185
  end
183
186
 
@@ -351,7 +354,7 @@ module PROIEL
351
354
  @head_id,
352
355
  (@head_id == 0 ? 'root' : @relation), # override non-root relations on root until we've found out how to handle unembedded reports etc
353
356
  '_', # slashes here
354
- '_'].join("\t")
357
+ @citation_part].join("\t")
355
358
  end
356
359
 
357
360
  def to_s
@@ -5,7 +5,7 @@ module PROIEL
5
5
  def process(tb, options)
6
6
  builder = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
7
7
  builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'
8
- builder.proiel('export-time' => DateTime.now.xmlschema, 'schema-version' => '2.0') do
8
+ builder.proiel('export-time' => DateTime.now.xmlschema, 'schema-version' => '2.1') do
9
9
  builder.annotation do
10
10
  builder.relations do
11
11
  tb.annotation_schema.relation_tags.each do |tag, value|
@@ -45,14 +45,24 @@ module PROIEL
45
45
  end
46
46
 
47
47
  tb.sources.each do |source|
48
- builder.source(id: source.id, language: source.language) do
48
+ mandatory_features = %i(id language)
49
+ optional_features = []
50
+ optional_features += %i(alignment_id) unless options['remove-alignments']
51
+
52
+ builder.source(grab_features(source, mandatory_features, optional_features)) do
49
53
  PROIEL::Treebank::METADATA_ELEMENTS.each do |field|
50
54
  builder.tag!(field.to_s.gsub('_', '-'), source.send(field)) if source.send(field)
51
55
  end
52
56
 
53
57
  source.divs.each do |div|
54
58
  if include_div?(div, options)
55
- builder.div(grab_features(div, %i(), %i(presentation_before presentation_after))) do
59
+ mandatory_features = %i()
60
+
61
+ optional_features = []
62
+ optional_features += %i(presentation_before presentation_after)
63
+ optional_features += %i(alignment_id) unless options['remove-alignments']
64
+
65
+ builder.div(grab_features(div, mandatory_features, optional_features)) do
56
66
  builder.title div.title if div.title
57
67
 
58
68
  div.sentences.each do |sentence|
@@ -62,6 +72,11 @@ module PROIEL
62
72
  optional_features = [] # we do it this way to preserve the order of status and presentation_* so that diffing files is easier
63
73
  optional_features += %i(status) unless options['remove-status']
64
74
  optional_features += %i(presentation_before presentation_after)
75
+ optional_features += %i(alignment_id) unless options['remove-alignments']
76
+ optional_features += %i(annotated_at) unless options['remove-annotator']
77
+ optional_features += %i(reviewed_at) unless options['remove-reviewer']
78
+ optional_features += %i(annotated_by) unless options['remove-annotator']
79
+ optional_features += %i(reviewed_by) unless options['remove-reviewer']
65
80
 
66
81
  builder.sentence(grab_features(sentence, mandatory_features, optional_features)) do
67
82
  sentence.tokens.each do |token|
@@ -83,6 +98,8 @@ module PROIEL
83
98
  mandatory_features << :empty_token_sort
84
99
  end
85
100
 
101
+ optional_features += %i(alignment_id) unless options['remove-alignments']
102
+
86
103
  attrs = grab_features(token, mandatory_features, optional_features)
87
104
 
88
105
  unless token.slashes.empty? or options['remove-syntax'] # this extra test avoids <token></token> style XML
@@ -1,5 +1,5 @@
1
1
  module PROIEL
2
2
  module CLI
3
- VERSION = '0.1.1'
3
+ VERSION = '1.0.0'
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: proiel-cli
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Marius L. Jøhndal
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2015-12-06 00:00:00.000000000 Z
12
+ date: 2016-06-04 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: builder
@@ -59,14 +59,14 @@ dependencies:
59
59
  requirements:
60
60
  - - "~>"
61
61
  - !ruby/object:Gem::Version
62
- version: '1.0'
62
+ version: '1.1'
63
63
  type: :runtime
64
64
  prerelease: false
65
65
  version_requirements: !ruby/object:Gem::Requirement
66
66
  requirements:
67
67
  - - "~>"
68
68
  - !ruby/object:Gem::Version
69
- version: '1.0'
69
+ version: '1.1'
70
70
  - !ruby/object:Gem::Dependency
71
71
  name: bundler
72
72
  requirement: !ruby/object:Gem::Requirement
@@ -204,6 +204,7 @@ files:
204
204
  - lib/proiel/cli/commands/info.rb
205
205
  - lib/proiel/cli/commands/tokenize.rb
206
206
  - lib/proiel/cli/commands/validate.rb
207
+ - lib/proiel/cli/commands/words.rb
207
208
  - lib/proiel/cli/converters/conll-u.rb
208
209
  - lib/proiel/cli/converters/conll-u/morphology.rb
209
210
  - lib/proiel/cli/converters/conll-u/syntax.rb
@@ -235,7 +236,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
235
236
  version: '0'
236
237
  requirements: []
237
238
  rubyforge_project:
238
- rubygems_version: 2.4.5.1
239
+ rubygems_version: 2.5.1
239
240
  signing_key:
240
241
  specification_version: 4
241
242
  summary: A command-line interface for working with PROIEL treebanks