slaw 9.2.0 → 10.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '0654597e1ede427474f5b9b4f703070c4d23fc34fdbf10700ae72485f8372a21'
4
- data.tar.gz: d19cd4ebfe1e256f5366addde1723817547075e6468ab31b0d65c8d492f5c6d1
3
+ metadata.gz: e20546bab171b4b78f42ebd1d1a38bc246253a25423ff48067483f15d4806f71
4
+ data.tar.gz: 4f5899aa5565c5a70714c2edb78774468d79620d992deef7c66991c526f63a8a
5
5
  SHA512:
6
- metadata.gz: a778d4798462049e8fbb123c30d72d2e43c4f7f8344e0bc31590d2c5873daa3675979cafdb3fc14be1327f3ed262c19e6220b2e2ed40e825cc68fe353bb57614
7
- data.tar.gz: 585ea851576bca2c5059a3693067066f9f06a9b1ef67761534b707ddcd51f447cc8953f7f7b33842486cb77e442b3f7d60dd63e763306d269db14f1c1b17fcaa
6
+ metadata.gz: e3d8d3dd989a852d7950deb9bc3172788f475fdfa9b6342d0d92d1ba8f60ad973ed3efc7029086e80a915672e8098e99cbbd3ad654692a103e222e3f480f53bd
7
+ data.tar.gz: c605715d80d4704c619c19dd5578ed70189c302ad77d7adce70fee6b52c57b095570b041f43de24a9705fd80da924274dcb21217e2df8442b352ce082a1c4bf2
data/.travis.yml CHANGED
@@ -2,6 +2,5 @@ language: ruby
2
2
  rvm:
3
3
  - 2.7.0
4
4
  - 2.6.2
5
- - 2.5.4
6
5
  before_install:
7
6
  - gem update bundler
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Slaw [![Build Status](https://travis-ci.org/longhotsummer/slaw.svg)](http://travis-ci.org/longhotsummer/slaw) [![Gem Version](https://badge.fury.io/rb/slaw.svg)](https://badge.fury.io/rb/slaw)
2
2
 
3
- Slaw is a lightweight library for generating Akoma Ntoso 2.0 Act XML from plain text documents.
3
+ Slaw is a lightweight library for generating Akoma Ntoso 3.0 Act XML from plain text documents.
4
4
  It is used to power [Indigo](https://github.com/laws-africa/indigo) and uses grammars developed for the legal
5
5
  tradition in South Africa, although other traditions are supported.
6
6
 
@@ -86,6 +86,32 @@ You can create your own grammar by creating a gem that provides these files and
86
86
 
87
87
  ## Changelog
88
88
 
89
+ ### 10.4.0 (9 April 2021)
90
+
91
+ * Remove dependency on mimemagic. Guess file type based on filename instead.
92
+
93
+ ### 10.3.1 (11 January 2021)
94
+
95
+ * Strip ASCII, Unicode General Punctuation and Unicode Supplemental Punctuation characters from num elements when building eIds
96
+
97
+ ### 10.2.0 (4 September 2020)
98
+
99
+ * support inline superscript `^^text^^`
100
+ * support inline subscript `_^text^_`
101
+
102
+ ### 10.1.0 (18 June 2020)
103
+
104
+ * hcontainer elements have name attributes, to be compliant with AKN 3.0
105
+
106
+ ### 10.0.0 (12 June 2020)
107
+
108
+ * BREAKING: Create XML with AKN 3 namespace (http://docs.oasis-open.org/legaldocml/ns/akn/3.0), AKN2 is no longer supported
109
+ * BREAKING: replace id attributes with eId attributes
110
+ * BREAKING: serialize schedules as attachments to the act, not as components that are peers of the act
111
+ * BREAKING: anonymous blocks are serialized as hcontainers, not paragraphs
112
+ * BREAKING: crossheading hcontainer IDs correctly use hcontainer
113
+ * Remove unnecessary schemaLocation header in root element
114
+
89
115
  ### 9.2.0 (10 June 2020)
90
116
 
91
117
  * Subpart numbers are optional
data/bin/slaw CHANGED
@@ -19,6 +19,7 @@ class SlawCLI < Thor
19
19
  option :section_number_position, enum: ['before-title', 'after-title', 'guess'], desc: "Where do section titles come in relation to the section number? Default: before-title"
20
20
  option :grammar, type: :string, desc: "Grammar name (usually a two-letter country code). Default is za."
21
21
  option :ascii, type: :boolean, default: false, desc: "Process text as ASCII using %-encoding. This can provide significant speed improvements if the grammar uses only ASCII literals. See https://github.com/cjheath/treetop/issues/31."
22
+ option :namespace, enum: ['akn3'], default: 'akn3', desc: 'AKN XML namespace to use.'
22
23
  def parse(name)
23
24
  logging
24
25
 
@@ -33,6 +34,11 @@ class SlawCLI < Thor
33
34
  text = extractor.extract_from_file(name)
34
35
  end
35
36
 
37
+ case options[:namespace]
38
+ when 'akn3'
39
+ Slaw.akn_namespace = Slaw::AKN3_NS
40
+ end
41
+
36
42
  generator = Slaw::ActGenerator.new(options[:grammar] || 'za')
37
43
 
38
44
  if options[:fragment]
@@ -49,7 +55,7 @@ class SlawCLI < Thor
49
55
 
50
56
  if options[:id_prefix]
51
57
  prefix = options[:id_prefix]
52
- prefix += "." unless prefix.end_with?('.')
58
+ prefix += "__" unless prefix.end_with?('__')
53
59
  generator.builder.fragment_id_prefix = prefix
54
60
  end
55
61
  end
@@ -1,5 +1,3 @@
1
- require 'mimemagic'
2
-
3
1
  module Slaw
4
2
  module Extract
5
3
 
@@ -13,15 +11,10 @@ module Slaw
13
11
  #
14
12
  # @return [String] extracted text
15
13
  def extract_from_file(filename)
16
- mimetype = get_mimetype(filename)
17
-
18
- case mimetype && mimetype.type
19
- when 'text/html'
14
+ if filename.end_with? '.html' or filename.end_with? '.htm'
20
15
  extract_from_html(filename)
21
- when 'text/plain', nil
22
- extract_from_text(filename)
23
16
  else
24
- raise ArgumentError.new("Unsupported file type #{mimetype || 'unknown'}")
17
+ extract_from_text(filename)
25
18
  end
26
19
  end
27
20
 
@@ -4,16 +4,14 @@ module Slaw
4
4
  # Counters for generating element IDs. This is a hash from the element ID
5
5
  # prefix, to another hash that maps the element type name to a count.
6
6
  #
7
- # For backwards compatibility, counters always start at -1, and must be
8
- # incremented before being used. This ensures that element ids start at 0.
9
- # This is NOT compatible with AKN 3.0 which requires that element numbers
10
- # start at 1.
7
+ # Counters always start at 0, and must be incremented before being used.
8
+ # This ensures that element ids start at 1, as per AKN 3.0 spec.
11
9
  #
12
10
  # eg.
13
11
  #
14
12
  # section-1 => paragraph => 2
15
13
  #
16
- @@counters = Hash.new{ |h, k| h[k] = Hash.new(-1) }
14
+ @@counters = Hash.new{ |h, k| h[k] = Hash.new(0) }
17
15
 
18
16
  def self.counters
19
17
  @@counters
@@ -22,6 +20,37 @@ module Slaw
22
20
  def self.reset!
23
21
  @@counters.clear
24
22
  end
23
+
24
+ # Clean a <num> value for use in an eId
25
+ # See https://docs.oasis-open.org/legaldocml/akn-nc/v1.0/os/akn-nc-v1.0-os.html#_Toc531692306
26
+ #
27
+ # "The number part of the identifiers of such elements corresponds to the
28
+ # stripping of all final punctuation, meaningless separations as well as
29
+ # redundant characters in the content of the <num> element. The
30
+ # representation is case-sensitive."
31
+ #
32
+ # Our algorithm is:
33
+ # 1. strip all leading and trailing whitespace and punctuation (using the unicode punctuation blocks)
34
+ # 2. strip all whitespace
35
+ # 3. replace all remaining punctuation with a hyphen.
36
+ #
37
+ # The General Punctuation block is \u2000-\u206F, and the Supplemental Punctuation block is \u2E00-\u2E7F.
38
+ #
39
+ # (i) -> i
40
+ # 1.2. -> 1-2
41
+ # “2.3” -> 2-3
42
+ # 3a bis -> 3abis
43
+ def self.clean(num)
44
+ # leading whitespace and punctuation
45
+ num = num.gsub(/^[\s\u{2000}-\u{206f}\u{2e00}-\u{2e7f}!"#$%&'()*+,\-.\/:;<=>?@\[\]^_`{|}~]+/, '')
46
+ # trailing whitespace and punctuation
47
+ num.gsub!(/[\s\u{2000}-\u{206f}\u{2e00}-\u{2e7f}!"#$%&'()*+,\-.\/:;<=>?@\[\]^_`{|}~]+$/, '')
48
+ # whitespace
49
+ num.gsub!(/\s/, '')
50
+ # remaining punctuation to a hyphen
51
+ num.gsub!(/[\u{2000}-\u{206f}\u{2e00}-\u{2e7f}!"#$%&'()*+,\-.\/:;<=>?@\[\]^_`{|}~]+/, '-')
52
+ num
53
+ end
25
54
  end
26
55
  end
27
56
  end
@@ -20,7 +20,7 @@ module Slaw
20
20
  end
21
21
 
22
22
  rule inline_item
23
- remark / image / ref / bold / italics / [^\n]
23
+ remark / image / ref / bold / italics / superscript / subscript / [^\n]
24
24
  <InlineItem>
25
25
  end
26
26
 
@@ -57,6 +57,18 @@ module Slaw
57
57
  <Ref>
58
58
  end
59
59
 
60
+ rule superscript
61
+ # ^^foo^^
62
+ '^^' content:(!'^^' inline_item)+ '^^'
63
+ <Superscript>
64
+ end
65
+
66
+ rule subscript
67
+ # _^foo^_
68
+ '_^' content:(!'^_' inline_item)+ '^_'
69
+ <Subscript>
70
+ end
71
+
60
72
  end
61
73
  end
62
74
  end
@@ -71,6 +71,26 @@ module Slaw
71
71
  end
72
72
  end
73
73
 
74
+ class Superscript < Treetop::Runtime::SyntaxNode
75
+ def to_xml(b, idprefix)
76
+ b.sup { |b|
77
+ for e in content.elements
78
+ e.inline_item.to_xml(b, idprefix)
79
+ end
80
+ }
81
+ end
82
+ end
83
+
84
+ class Subscript < Treetop::Runtime::SyntaxNode
85
+ def to_xml(b, idprefix)
86
+ b.sub { |b|
87
+ for e in content.elements
88
+ e.inline_item.to_xml(b, idprefix)
89
+ end
90
+ }
91
+ end
92
+ end
93
+
74
94
  end
75
95
  end
76
96
  end
@@ -10,7 +10,7 @@ module Slaw
10
10
 
11
11
  class ScheduleContainer < Treetop::Runtime::SyntaxNode
12
12
  def to_xml(b, idprefix="")
13
- b.components { |b|
13
+ b.attachments { |b|
14
14
  schedules.children.elements.each_with_index { |e, i|
15
15
  e.to_xml(b, idprefix, i+1)
16
16
  }
@@ -86,6 +86,9 @@ module Slaw
86
86
  end
87
87
 
88
88
  def to_xml(b, idprefix=nil, i=1)
89
+ # reset counters for this new schedule document
90
+ Slaw::Grammars::Counters.reset!
91
+
89
92
  heading_text = self.schedule_title.heading_text
90
93
  if not heading_text
91
94
  heading_text = "Schedule"
@@ -95,12 +98,13 @@ module Slaw
95
98
  # the schedule id is derived from the heading
96
99
  schedule_id = self.schedule_id(heading_text, i)
97
100
 
98
- b.component(id: "component-#{schedule_id}") { |b|
99
- b.doc_(name: schedule_id) { |b|
101
+ b.attachment(eId: "att_#{i}") { |b|
102
+ schedule_title.to_xml(b, '', heading_text)
103
+ b.doc_(name: "schedule") { |b|
100
104
  b.meta { |b|
101
105
  b.identification(source: "#slaw") { |b|
102
106
  b.FRBRWork { |b|
103
- b.FRBRthis(value: "#{WORK_URI}/#{schedule_id}")
107
+ b.FRBRthis(value: "#{WORK_URI}/!#{schedule_id}")
104
108
  b.FRBRuri(value: WORK_URI)
105
109
  b.FRBRalias(value: heading_text)
106
110
  b.FRBRdate(date: '1980-01-01', name: 'Generation')
@@ -108,14 +112,14 @@ module Slaw
108
112
  b.FRBRcountry(value: 'za')
109
113
  }
110
114
  b.FRBRExpression { |b|
111
- b.FRBRthis(value: "#{EXPRESSION_URI}/#{schedule_id}")
115
+ b.FRBRthis(value: "#{EXPRESSION_URI}/!#{schedule_id}")
112
116
  b.FRBRuri(value: EXPRESSION_URI)
113
117
  b.FRBRdate(date: '1980-01-01', name: 'Generation')
114
118
  b.FRBRauthor(href: '#council')
115
119
  b.FRBRlanguage(language: 'eng')
116
120
  }
117
121
  b.FRBRManifestation { |b|
118
- b.FRBRthis(value: "#{MANIFESTATION_URI}/#{schedule_id}")
122
+ b.FRBRthis(value: "#{MANIFESTATION_URI}/!#{schedule_id}")
119
123
  b.FRBRuri(value: MANIFESTATION_URI)
120
124
  b.FRBRdate(date: Time.now.strftime('%Y-%m-%d'), name: 'Generation')
121
125
  b.FRBRauthor(href: '#slaw')
@@ -124,14 +128,7 @@ module Slaw
124
128
  }
125
129
 
126
130
  b.mainBody { |b|
127
- idprefix = "#{schedule_id}."
128
-
129
- # there is no good AKN hierarchy container for schedules, so we
130
- # use hcontainer instead
131
- b.hcontainer(id: schedule_id, name: "schedule") { |b|
132
- schedule_title.to_xml(b, idprefix, heading_text)
133
- body.children.elements.each_with_index { |e| e.to_xml(b, idprefix, i) } if body.is_a? Body
134
- }
131
+ body.children.elements.each_with_index { |e| e.to_xml(b, '', i) } if body.is_a? Body
135
132
  }
136
133
  }
137
134
  }
@@ -3,7 +3,9 @@ module Slaw
3
3
  module Tables
4
4
  class Table < Treetop::Runtime::SyntaxNode
5
5
  def to_xml(b, idprefix, i=0)
6
- b.table(id: "#{idprefix}table#{i}") { |b|
6
+ cnt = Slaw::Grammars::Counters.counters[idprefix]['table'] += 1
7
+
8
+ b.table(eId: "#{idprefix}table_#{cnt}") { |b|
7
9
  # we'll gather cells into this row list
8
10
  rows = []
9
11
  cells = []
@@ -12,13 +12,13 @@ module Slaw
12
12
  MANIFESTATION_URI = EXPRESSION_URI
13
13
 
14
14
  def to_xml(b, idprefix=nil, i=0)
15
- b.act(contains: "originalVersion") { |b|
15
+ b.act(contains: 'originalVersion', name: 'act') { |b|
16
16
  write_meta(b)
17
17
  write_preface(b)
18
18
  write_preamble(b)
19
19
  write_body(b)
20
+ write_schedules(b)
20
21
  }
21
- write_schedules(b)
22
22
  end
23
23
 
24
24
  def write_meta(b)
@@ -26,8 +26,8 @@ module Slaw
26
26
  write_identification(b)
27
27
 
28
28
  b.references(source: "#this") {
29
- b.TLCOrganization(id: 'slaw', href: 'https://github.com/longhotsummer/slaw', showAs: "Slaw")
30
- b.TLCOrganization(id: 'council', href: '/ontology/organization/za/council', showAs: "Council")
29
+ b.TLCOrganization(eId: 'slaw', href: 'https://github.com/longhotsummer/slaw', showAs: "Slaw")
30
+ b.TLCOrganization(eId: 'council', href: '/ontology/organization/za/council', showAs: "Council")
31
31
  }
32
32
  }
33
33
  end
@@ -38,7 +38,7 @@ module Slaw
38
38
  b.FRBRWork { |b|
39
39
  b.FRBRthis(value: "#{WORK_URI}/main")
40
40
  b.FRBRuri(value: WORK_URI)
41
- b.FRBRalias(value: 'Short Title')
41
+ b.FRBRalias(value: 'Short Title', name: 'title')
42
42
  b.FRBRdate(date: '1980-01-01', name: 'Generation')
43
43
  b.FRBRauthor(href: '#council')
44
44
  b.FRBRcountry(value: 'za')
@@ -125,7 +125,7 @@ module Slaw
125
125
  if !stmts.empty?
126
126
  b.preamble { |b|
127
127
  stmts.each { |e|
128
- e.preamble_statement.to_xml(b, "")
128
+ e.preamble_statement.to_xml(b, "preamble__")
129
129
  }
130
130
  }
131
131
  end
@@ -138,11 +138,11 @@ module Slaw
138
138
  end
139
139
 
140
140
  def to_xml(b, id_prefix='', *args)
141
- id = id_prefix + "part-#{num}"
141
+ id = id_prefix + "part_#{Slaw::Grammars::Counters.clean(num)}"
142
142
 
143
- b.part(id: id) { |b|
143
+ b.part(eId: id) { |b|
144
144
  heading.to_xml(b)
145
- children.elements.each_with_index { |e, i| e.to_xml(b, id + '.', i) }
145
+ children.elements.each_with_index { |e, i| e.to_xml(b, id + '__', i) }
146
146
  }
147
147
  end
148
148
  end
@@ -171,13 +171,15 @@ module Slaw
171
171
  num = self.num
172
172
  if num.empty?
173
173
  num = Slaw::Grammars::Counters.counters[id_prefix]['subpart'] += 1
174
+ else
175
+ num = Slaw::Grammars::Counters.clean(num)
174
176
  end
175
177
 
176
- id = id_prefix + "subpart-#{num}"
178
+ id = id_prefix + "subpart_#{num}"
177
179
 
178
- b.subpart(id: id) { |b|
180
+ b.subpart(eId: id) { |b|
179
181
  heading.to_xml(b)
180
- children.elements.each_with_index { |e, i| e.to_xml(b, id + '.', i) }
182
+ children.elements.each_with_index { |e, i| e.to_xml(b, id + '__', i) }
181
183
  }
182
184
  end
183
185
  end
@@ -203,11 +205,11 @@ module Slaw
203
205
  end
204
206
 
205
207
  def to_xml(b, id_prefix='', *args)
206
- id = id_prefix + "chapter-#{num}"
208
+ id = id_prefix + "chp_#{Slaw::Grammars::Counters.clean(num)}"
207
209
 
208
- b.chapter(id: id) { |b|
210
+ b.chapter(eId: id) { |b|
209
211
  heading.to_xml(b)
210
- children.elements.each_with_index { |e, i| e.to_xml(b, id + '.', i) }
212
+ children.elements.each_with_index { |e, i| e.to_xml(b, id + '__', i) }
211
213
  }
212
214
  end
213
215
  end
@@ -233,11 +235,11 @@ module Slaw
233
235
  end
234
236
 
235
237
  def to_xml(b, *args)
236
- id = "section-#{num}"
237
- b.section(id: id) { |b|
238
+ id = "sec_#{Slaw::Grammars::Counters.clean(num)}"
239
+ b.section(eId: id) { |b|
238
240
  section_title.to_xml(b)
239
241
 
240
- idprefix = "#{id}."
242
+ idprefix = "#{id}__"
241
243
  children.elements.each_with_index { |e, i| e.to_xml(b, idprefix, i) }
242
244
  }
243
245
  end
@@ -308,11 +310,11 @@ module Slaw
308
310
 
309
311
  class BlockElements < Treetop::Runtime::SyntaxNode
310
312
  def to_xml(b, idprefix='', i=0)
311
- cnt = Slaw::Grammars::Counters.counters[idprefix]['paragraph'] += 1
312
- id = "#{idprefix}paragraph#{cnt}"
313
- idprefix = "#{id}."
313
+ cnt = Slaw::Grammars::Counters.counters[idprefix]['hcontainer'] += 1
314
+ id = "#{idprefix}hcontainer_#{cnt}"
315
+ idprefix = "#{id}__"
314
316
 
315
- b.paragraph(id: id) { |b|
317
+ b.hcontainer(eId: id, name: 'hcontainer') { |b|
316
318
  b.content { |b|
317
319
  elements.each_with_index { |e, i| e.to_xml(b, idprefix, i) }
318
320
  }
@@ -326,10 +328,10 @@ module Slaw
326
328
  end
327
329
 
328
330
  def to_xml(b, idprefix, i)
329
- id = idprefix + num.gsub(/[()]/, '')
330
- idprefix = id + "."
331
+ id = idprefix + "subsec_" + Slaw::Grammars::Counters.clean(num)
332
+ idprefix = id + "__"
331
333
 
332
- b.subsection(id: id) { |b|
334
+ b.subsection(eId: id) { |b|
333
335
  b.num(num)
334
336
  block_elements_with_inline.to_xml(b, idprefix)
335
337
  }
@@ -341,10 +343,10 @@ module Slaw
341
343
  # yield to it a builder to insert a listIntroduction node
342
344
  def to_xml(b, idprefix, i=0, &block)
343
345
  cnt = Slaw::Grammars::Counters.counters[idprefix]['list'] += 1
344
- id = idprefix + "list#{cnt}"
345
- idprefix = id + '.'
346
+ id = idprefix + "list_#{cnt}"
347
+ idprefix = id + '__'
346
348
 
347
- b.blockList(id: id, renest: true) { |b|
349
+ b.blockList(eId: id, renest: true) { |b|
348
350
  b.listIntroduction { |b| yield b } if block_given?
349
351
 
350
352
  elements.each { |e| e.to_xml(b, idprefix) }
@@ -358,7 +360,7 @@ module Slaw
358
360
  end
359
361
 
360
362
  def to_xml(b, idprefix)
361
- b.item(id: idprefix + num.gsub(/[()]/, '')) { |b|
363
+ b.item(eId: idprefix + "item_" + Slaw::Grammars::Counters.clean(num)) { |b|
362
364
  b.num(num)
363
365
  b.p { |b|
364
366
  item_content.inline_items.to_xml(b, idprefix) if respond_to? :item_content and item_content.respond_to? :inline_items
@@ -369,10 +371,10 @@ module Slaw
369
371
 
370
372
  class Crossheading < Treetop::Runtime::SyntaxNode
371
373
  def to_xml(b, idprefix, i=0)
372
- cnt = Slaw::Grammars::Counters.counters[idprefix]['crossheading'] += 1
373
- id = "#{idprefix}crossheading-#{cnt}"
374
+ cnt = Slaw::Grammars::Counters.counters[idprefix]['hcontainer'] += 1
375
+ id = "#{idprefix}hcontainer_#{cnt}"
374
376
 
375
- b.hcontainer(id: id, name: 'crossheading') { |b|
377
+ b.hcontainer(eId: id, name: 'crossheading') { |b|
376
378
  b.heading { |b|
377
379
  inline_items.to_xml(b, idprefix)
378
380
  }