proiel-cli 0.1.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: bfb4db333ae791d4171490c7b3d316364b205729
4
- data.tar.gz: fd001265bdcc8e75e49fc1e2a188f7c63471bb62
3
+ metadata.gz: cf5056d98706541003c897f25b470145a26dab43
4
+ data.tar.gz: 3bc9d72f373deec98554ec4bab9d0482f427a6b6
5
5
  SHA512:
6
- metadata.gz: 29cb26e28f22486db097b982d7e2cd4783f2b38d8d6c13fb06309fb25377791cd8248c88627b7d5b72c3adfe0e958c9696b2508fcb975e9a8fdf516a9825963b
7
- data.tar.gz: b565292467456b10415039f0464bdfafcae5fecf55747dd3b9ec2473313ba08789d49431ecb06d649596bfd98031640481efccd9b5be58b6b565d13dcc3a1161
6
+ metadata.gz: 6f4a512e9c4f35ccfdd84c4ec1c7d2bed4c5089862802c6171247817b64bb00ba9a3dadf2e82b7ba12bee2b647b86d0faf705e402e34ba6f74c0bdc3a8547796
7
+ data.tar.gz: c14b7972c896f8f47dbf764b69796c011c9827b715fbf324b2bcca58f82d30acad7f0906d084e3e654299bb5cc22a77547c7fa6a7ef0e6d1c135227cf69701d9
data/bin/proiel CHANGED
@@ -6,7 +6,7 @@ $:.unshift File.join(File.dirname(__FILE__), *%w{ .. lib })
6
6
  require 'proiel/cli'
7
7
 
8
8
  Mercenary.program(:proiel) do |p|
9
- p.version PROIEL::VERSION
9
+ p.version PROIEL::CLI::VERSION
10
10
  p.description 'proiel is a command-line interface for PROIEL treebanks'
11
11
  p.syntax 'proiel <subcommand> [options]'
12
12
 
@@ -16,6 +16,9 @@ module PROIEL
16
16
  f.option 'remove-syntax', '--remove-syntax', 'Remove syntactic annotation (relation, head ID and slashes)'
17
17
  f.option 'remove-information-structure', '--remove-information-structure', 'Remove informtion structure annotation (antecedent ID, information status and contrast group)'
18
18
  f.option 'remove-status', '--remove-status', 'Remove sentence status (i.e. revert all sentences to unannotated status)'
19
+ f.option 'remove-alignments', '--remove-alignments', 'Remove alignments'
20
+ f.option 'remove-annotator', '--remove-annotator', 'Remove annotator information'
21
+ f.option 'remove-reviewer', '--remove-reviewer', 'Remove reviewer information'
19
22
  f.option 'remove-empty-divs', '--remove-empty-divs', 'Remove div elements that do not contain any sentences'
20
23
  f.action { |args, options| process(args, options, PROIEL::Converter::PROIELXML) }
21
24
  end
@@ -48,8 +48,9 @@ module PROIEL
48
48
  citation_part = nil
49
49
 
50
50
  body.each_with_index do |sd_body, i|
51
- builder.div(title: sd_body[:title]) do
52
- sd_body[:contents].split(/(@[^ ]+|§[^ ]+)/).map do |s|
51
+ builder.div do
52
+ builder.title sd_body[:title]
53
+ sd_body[:contents].split(/(@[^ ]+|§[^ ]+ )/).map do |s|
53
54
  if s[0] == '§' or s[0] == '@'
54
55
  s
55
56
  else
@@ -57,16 +58,16 @@ module PROIEL
57
58
  # sentence-breaking punctuation like periods and question marks, but
58
59
  # after the punctuation mark and characters typically used in pairs,
59
60
  # like brackets and apostrophes.
60
- s.gsub(/([\.:;\?!]+[\s†\]\)"']*)/, '\1|')
61
+ s.gsub(/([\.:;\?!]+[\s†\]\)"']*|\s*[\n\r]+)/, '\1|')
61
62
  end
62
63
  end.join.split('|').each_with_index do |s_body, j|
63
- builder.sentence(status_tag: 'unannotated') do
64
+ builder.sentence(status: 'unannotated') do
64
65
  leftover_before = ''
65
66
 
66
67
  # Preserve linebreaks in the text.
67
- s_body.gsub!(/\s*[\n\r]/, "\u2028")
68
+ s_body.gsub!(/\s*[\n\r]+/, "\u2028")
68
69
 
69
- s_body.scan(/([^@§\p{Word}]*)([\p{Word}]+|@[^ ]+|§[^ ]+)([^@§\p{Word}]*)/).each do |(before, form, after)|
70
+ s_body.scan(/([^@§\p{Word}]*)([\p{Word}]+|@[^ ]+|§[^ ]+ )([^@§\p{Word}]*)/).each do |(before, form, after)|
70
71
  case form
71
72
  when /^@(.*)$/
72
73
  leftover_before += before unless before.nil?
@@ -74,15 +75,15 @@ module PROIEL
74
75
  leftover_before += after unless after.nil?
75
76
  when /^§(.*)$/
76
77
  leftover_before += before unless before.nil?
77
- citation_part = $1
78
+ citation_part = $1.strip
78
79
  leftover_before += after unless after.nil?
79
80
  else
80
81
  before = leftover_before + before
81
82
  leftover_before = ''
82
83
 
83
- attrs = { citation_part: citation_part, form: form }
84
- attrs[:presentation_before] = before unless before == ''
85
- attrs[:presentation_after] = after unless after == ''
84
+ attrs = { :"citation-part" => citation_part, form: form }
85
+ attrs[:"presentation-before"] = before unless before == ''
86
+ attrs[:"presentation-after"] = after unless after == ''
86
87
 
87
88
  builder.token(attrs)
88
89
  end
@@ -0,0 +1,36 @@
module PROIEL
  module Commands
    # Implements the +words+ subcommand, which prints the distinct token
    # forms found in one or more treebank files as a sorted word list.
    class Words < Command
      class << self
        # Registers the +words+ subcommand, its syntax line, its description
        # and its action on +prog+.
        #
        # @param prog the program object the subcommand is attached to
        def init_with_program(prog)
          prog.command(:words) do |c|
            c.syntax 'words [options] filename(s)'
            c.description 'Extract a word list'

            c.action do |args, options|
              if args.empty?
                STDERR.puts 'Missing filename(s). Use --help for more information.'
              else
                process(args, options)
              end
            end
          end
        end

        # Loads every file named in +args+ into a single treebank and writes
        # each distinct token form to standard output, one form per line, in
        # sorted order.
        #
        # @param args [Array<String>] names of the XML files to read
        # @param options [Hash] command-line options; a truthy 'verbose'
        #   entry makes progress messages appear on standard error
        def process(args, options)
          tb = PROIEL::Treebank.new

          args.each do |filename|
            STDERR.puts "Reading #{filename}...".green if options['verbose']

            tb.load_from_xml(filename)
          end

          forms = tb.sources.flat_map { |source| source.tokens.map(&:form) }

          # A token without a form would put nil into the list, and sorting
          # a mix of nil and strings raises ArgumentError. Such tokens are
          # presumably empty tokens (verify against the treebank library);
          # they carry no word and are left out of the word list.
          forms.compact.uniq.sort.each do |form|
            STDOUT.puts form
          end
        end
      end
    end
  end
end
@@ -57,6 +57,7 @@ module PROIEL
57
57
  t.relation,
58
58
  t.empty_token_sort,
59
59
  t.slashes.map { |relation, target_id| [id_to_number[target_id], relation] },
60
+ t.citation_part,
60
61
  self
61
62
  )
62
63
  end
@@ -164,8 +165,9 @@ module PROIEL
164
165
  attr_reader :language
165
166
  attr_reader :empty_token_sort
166
167
  attr_reader :form
168
+ attr_reader :citation_part
167
169
 
168
- def initialize(id, head_id, form, lemma, part_of_speech, language, morphology, relation, empty_token_sort, slashes, sentence)
170
+ def initialize(id, head_id, form, lemma, part_of_speech, language, morphology, relation, empty_token_sort, slashes, citation_part, sentence)
169
171
  @id = id
170
172
  @head_id = head_id
171
173
  @form = form
@@ -178,6 +180,7 @@ module PROIEL
178
180
  @slashes = slashes
179
181
  @sentence = sentence
180
182
  @features = (morphology ? map_morphology(morphology) : '' )
183
+ @citation_part = "ref=" + (citation_part ? citation_part : "").gsub(/\s/, '_')
181
184
  @upos = nil
182
185
  end
183
186
 
@@ -351,7 +354,7 @@ module PROIEL
351
354
  @head_id,
352
355
  (@head_id == 0 ? 'root' : @relation), # override non-root relations on root until we've found out how to handle unembedded reports etc
353
356
  '_', # slashes here
354
- '_'].join("\t")
357
+ @citation_part].join("\t")
355
358
  end
356
359
 
357
360
  def to_s
@@ -5,7 +5,7 @@ module PROIEL
5
5
  def process(tb, options)
6
6
  builder = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
7
7
  builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'
8
- builder.proiel('export-time' => DateTime.now.xmlschema, 'schema-version' => '2.0') do
8
+ builder.proiel('export-time' => DateTime.now.xmlschema, 'schema-version' => '2.1') do
9
9
  builder.annotation do
10
10
  builder.relations do
11
11
  tb.annotation_schema.relation_tags.each do |tag, value|
@@ -45,14 +45,24 @@ module PROIEL
45
45
  end
46
46
 
47
47
  tb.sources.each do |source|
48
- builder.source(id: source.id, language: source.language) do
48
+ mandatory_features = %i(id language)
49
+ optional_features = []
50
+ optional_features += %i(alignment_id) unless options['remove-alignments']
51
+
52
+ builder.source(grab_features(source, mandatory_features, optional_features)) do
49
53
  PROIEL::Treebank::METADATA_ELEMENTS.each do |field|
50
54
  builder.tag!(field.to_s.gsub('_', '-'), source.send(field)) if source.send(field)
51
55
  end
52
56
 
53
57
  source.divs.each do |div|
54
58
  if include_div?(div, options)
55
- builder.div(grab_features(div, %i(), %i(presentation_before presentation_after))) do
59
+ mandatory_features = %i()
60
+
61
+ optional_features = []
62
+ optional_features += %i(presentation_before presentation_after)
63
+ optional_features += %i(alignment_id) unless options['remove-alignments']
64
+
65
+ builder.div(grab_features(div, mandatory_features, optional_features)) do
56
66
  builder.title div.title if div.title
57
67
 
58
68
  div.sentences.each do |sentence|
@@ -62,6 +72,11 @@ module PROIEL
62
72
  optional_features = [] # we do it this way to preserve the order of status and presentation_* so that diffing files is easier
63
73
  optional_features += %i(status) unless options['remove-status']
64
74
  optional_features += %i(presentation_before presentation_after)
75
+ optional_features += %i(alignment_id) unless options['remove-alignments']
76
+ optional_features += %i(annotated_at) unless options['remove-annotator']
77
+ optional_features += %i(reviewed_at) unless options['remove-reviewer']
78
+ optional_features += %i(annotated_by) unless options['remove-annotator']
79
+ optional_features += %i(reviewed_by) unless options['remove-reviewer']
65
80
 
66
81
  builder.sentence(grab_features(sentence, mandatory_features, optional_features)) do
67
82
  sentence.tokens.each do |token|
@@ -83,6 +98,8 @@ module PROIEL
83
98
  mandatory_features << :empty_token_sort
84
99
  end
85
100
 
101
+ optional_features += %i(alignment_id) unless options['remove-alignments']
102
+
86
103
  attrs = grab_features(token, mandatory_features, optional_features)
87
104
 
88
105
  unless token.slashes.empty? or options['remove-syntax'] # this extra test avoids <token></token> style XML
@@ -1,5 +1,5 @@
1
1
  module PROIEL
2
2
  module CLI
3
- VERSION = '0.1.1'
3
+ VERSION = '1.0.0'
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: proiel-cli
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Marius L. Jøhndal
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2015-12-06 00:00:00.000000000 Z
12
+ date: 2016-06-04 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: builder
@@ -59,14 +59,14 @@ dependencies:
59
59
  requirements:
60
60
  - - "~>"
61
61
  - !ruby/object:Gem::Version
62
- version: '1.0'
62
+ version: '1.1'
63
63
  type: :runtime
64
64
  prerelease: false
65
65
  version_requirements: !ruby/object:Gem::Requirement
66
66
  requirements:
67
67
  - - "~>"
68
68
  - !ruby/object:Gem::Version
69
- version: '1.0'
69
+ version: '1.1'
70
70
  - !ruby/object:Gem::Dependency
71
71
  name: bundler
72
72
  requirement: !ruby/object:Gem::Requirement
@@ -204,6 +204,7 @@ files:
204
204
  - lib/proiel/cli/commands/info.rb
205
205
  - lib/proiel/cli/commands/tokenize.rb
206
206
  - lib/proiel/cli/commands/validate.rb
207
+ - lib/proiel/cli/commands/words.rb
207
208
  - lib/proiel/cli/converters/conll-u.rb
208
209
  - lib/proiel/cli/converters/conll-u/morphology.rb
209
210
  - lib/proiel/cli/converters/conll-u/syntax.rb
@@ -235,7 +236,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
235
236
  version: '0'
236
237
  requirements: []
237
238
  rubyforge_project:
238
- rubygems_version: 2.4.5.1
239
+ rubygems_version: 2.5.1
239
240
  signing_key:
240
241
  specification_version: 4
241
242
  summary: A command-line interface for working with PROIEL treebanks