slaw 9.2.0 → 10.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '0654597e1ede427474f5b9b4f703070c4d23fc34fdbf10700ae72485f8372a21'
4
- data.tar.gz: d19cd4ebfe1e256f5366addde1723817547075e6468ab31b0d65c8d492f5c6d1
3
+ metadata.gz: e20546bab171b4b78f42ebd1d1a38bc246253a25423ff48067483f15d4806f71
4
+ data.tar.gz: 4f5899aa5565c5a70714c2edb78774468d79620d992deef7c66991c526f63a8a
5
5
  SHA512:
6
- metadata.gz: a778d4798462049e8fbb123c30d72d2e43c4f7f8344e0bc31590d2c5873daa3675979cafdb3fc14be1327f3ed262c19e6220b2e2ed40e825cc68fe353bb57614
7
- data.tar.gz: 585ea851576bca2c5059a3693067066f9f06a9b1ef67761534b707ddcd51f447cc8953f7f7b33842486cb77e442b3f7d60dd63e763306d269db14f1c1b17fcaa
6
+ metadata.gz: e3d8d3dd989a852d7950deb9bc3172788f475fdfa9b6342d0d92d1ba8f60ad973ed3efc7029086e80a915672e8098e99cbbd3ad654692a103e222e3f480f53bd
7
+ data.tar.gz: c605715d80d4704c619c19dd5578ed70189c302ad77d7adce70fee6b52c57b095570b041f43de24a9705fd80da924274dcb21217e2df8442b352ce082a1c4bf2
data/.travis.yml CHANGED
@@ -2,6 +2,5 @@ language: ruby
2
2
  rvm:
3
3
  - 2.7.0
4
4
  - 2.6.2
5
- - 2.5.4
6
5
  before_install:
7
6
  - gem update bundler
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Slaw [![Build Status](https://travis-ci.org/longhotsummer/slaw.svg)](http://travis-ci.org/longhotsummer/slaw) [![Gem Version](https://badge.fury.io/rb/slaw.svg)](https://badge.fury.io/rb/slaw)
2
2
 
3
- Slaw is a lightweight library for generating Akoma Ntoso 2.0 Act XML from plain text documents.
3
+ Slaw is a lightweight library for generating Akoma Ntoso 3.0 Act XML from plain text documents.
4
4
  It is used to power [Indigo](https://github.com/laws-africa/indigo) and uses grammars developed for the legal
5
5
  tradition in South Africa, although other traditions are supported.
6
6
 
@@ -86,6 +86,32 @@ You can create your own grammar by creating a gem that provides these files and
86
86
 
87
87
  ## Changelog
88
88
 
89
+ ### 10.4.0 (9 April 2021)
90
+
91
+ * Remove dependency on mimemagic. Guess file type based on filename instead.
92
+
93
+ ### 10.3.1 (11 January 2021)
94
+
95
+ * Strip ASCII punctuation and characters in the Unicode General Punctuation and Supplemental Punctuation blocks from num elements when building eIds
96
+
97
+ ### 10.2.0 (4 September 2020)
98
+
99
+ * support inline superscript `^^text^^`
100
+ * support inline subscript `_^text^_`
101
+
102
+ ### 10.1.0 (18 June 2020)
103
+
104
+ * hcontainer elements have name attributes, to be compliant with AKN 3.0
105
+
106
+ ### 10.0.0 (12 June 2020)
107
+
108
+ * BREAKING: Create XML with AKN 3 namespace (http://docs.oasis-open.org/legaldocml/ns/akn/3.0), AKN2 is no longer supported
109
+ * BREAKING: replace id attributes with eId attributes
110
+ * BREAKING: serialize schedules as attachments to act, not as components as peers of the act
111
+ * BREAKING: anonymous blocks are serialized as hcontainers, not paragraphs
112
+ * BREAKING: crossheading hcontainer IDs correctly use hcontainer
113
+ * Remove unnecessary schemaLocation header in root element
114
+
89
115
  ### 9.2.0 (10 June 2020)
90
116
 
91
117
  * Subpart numbers are optional
data/bin/slaw CHANGED
@@ -19,6 +19,7 @@ class SlawCLI < Thor
19
19
  option :section_number_position, enum: ['before-title', 'after-title', 'guess'], desc: "Where do section titles come in relation to the section number? Default: before-title"
20
20
  option :grammar, type: :string, desc: "Grammar name (usually a two-letter country code). Default is za."
21
21
  option :ascii, type: :boolean, default: false, desc: "Process text as ASCII using %-encoding. This can provide significant speed improvements if the grammar uses only ASCII literals. See https://github.com/cjheath/treetop/issues/31."
22
+ option :namespace, enum: ['akn3'], default: 'akn3', desc: 'AKN XML namespace to use.'
22
23
  def parse(name)
23
24
  logging
24
25
 
@@ -33,6 +34,11 @@ class SlawCLI < Thor
33
34
  text = extractor.extract_from_file(name)
34
35
  end
35
36
 
37
+ case options[:namespace]
38
+ when 'akn3'
39
+ Slaw.akn_namespace = Slaw::AKN3_NS
40
+ end
41
+
36
42
  generator = Slaw::ActGenerator.new(options[:grammar] || 'za')
37
43
 
38
44
  if options[:fragment]
@@ -49,7 +55,7 @@ class SlawCLI < Thor
49
55
 
50
56
  if options[:id_prefix]
51
57
  prefix = options[:id_prefix]
52
- prefix += "." unless prefix.end_with?('.')
58
+ prefix += "__" unless prefix.end_with?('__')
53
59
  generator.builder.fragment_id_prefix = prefix
54
60
  end
55
61
  end
@@ -1,5 +1,3 @@
1
- require 'mimemagic'
2
-
3
1
  module Slaw
4
2
  module Extract
5
3
 
@@ -13,15 +11,10 @@ module Slaw
13
11
  #
14
12
  # @return [String] extracted text
15
13
  def extract_from_file(filename)
16
- mimetype = get_mimetype(filename)
17
-
18
- case mimetype && mimetype.type
19
- when 'text/html'
14
+ if filename.end_with? '.html' or filename.end_with? '.htm'
20
15
  extract_from_html(filename)
21
- when 'text/plain', nil
22
- extract_from_text(filename)
23
16
  else
24
- raise ArgumentError.new("Unsupported file type #{mimetype || 'unknown'}")
17
+ extract_from_text(filename)
25
18
  end
26
19
  end
27
20
 
@@ -4,16 +4,14 @@ module Slaw
4
4
  # Counters for generating element IDs. This is a hash from the element ID
5
5
  # prefix, to another hash that maps the element type name to a count.
6
6
  #
7
- # For backwards compatibility, counters always start at -1, and must be
8
- # incremented before being used. This ensures that element ids start at 0.
9
- # This is NOT compatible with AKN 3.0 which requires that element numbers
10
- # start at 1.
7
+ # Counters always start at 0, and must be incremented before being used.
8
+ # This ensures that element ids start at 1, as per AKN 3.0 spec.
11
9
  #
12
10
  # eg.
13
11
  #
14
12
  # section-1 => paragraph => 2
15
13
  #
16
- @@counters = Hash.new{ |h, k| h[k] = Hash.new(-1) }
14
+ @@counters = Hash.new{ |h, k| h[k] = Hash.new(0) }
17
15
 
18
16
  def self.counters
19
17
  @@counters
@@ -22,6 +20,37 @@ module Slaw
22
20
  def self.reset!
23
21
  @@counters.clear
24
22
  end
23
+
24
+ # Clean a <num> value for use in an eId
25
+ # See https://docs.oasis-open.org/legaldocml/akn-nc/v1.0/os/akn-nc-v1.0-os.html#_Toc531692306
26
+ #
27
+ # "The number part of the identifiers of such elements corresponds to the
28
+ # stripping of all final punctuation, meaningless separations as well as
29
+ # redundant characters in the content of the <num> element. The
30
+ # representation is case-sensitive."
31
+ #
32
+ # Our algorithm is:
33
+ # 1. strip all leading and trailing whitespace and punctuation (using the unicode punctuation blocks)
34
+ # 2. strip all whitespace
35
+ # 3. replace all remaining punctuation with a hyphen.
36
+ #
37
+ # The General Punctuation block is \u2000-\u206F, and the Supplemental Punctuation block is \u2E00-\u2E7F.
38
+ #
39
+ # (i) -> i
40
+ # 1.2. -> 1-2
41
+ # “2.3“ -> 2-3
42
+ # 3a bis -> 3abis
43
+ def self.clean(num)
44
+ # leading whitespace and punctuation
45
+ num = num.gsub(/^[\s\u{2000}-\u{206f}\u{2e00}-\u{2e7f}!"#$%&'()*+,\-.\/:;<=>?@\[\]^_`{|}~]+/, '')
46
+ # trailing whitespace and punctuation
47
+ num.gsub!(/[\s\u{2000}-\u{206f}\u{2e00}-\u{2e7f}!"#$%&'()*+,\-.\/:;<=>?@\[\]^_`{|}~]+$/, '')
48
+ # whitespace
49
+ num.gsub!(/\s/, '')
50
+ # remaining punctuation to a hyphen
51
+ num.gsub!(/[\u{2000}-\u{206f}\u{2e00}-\u{2e7f}!"#$%&'()*+,\-.\/:;<=>?@\[\]^_`{|}~]+/, '-')
52
+ num
53
+ end
25
54
  end
26
55
  end
27
56
  end
@@ -20,7 +20,7 @@ module Slaw
20
20
  end
21
21
 
22
22
  rule inline_item
23
- remark / image / ref / bold / italics / [^\n]
23
+ remark / image / ref / bold / italics / superscript / subscript / [^\n]
24
24
  <InlineItem>
25
25
  end
26
26
 
@@ -57,6 +57,18 @@ module Slaw
57
57
  <Ref>
58
58
  end
59
59
 
60
+ rule superscript
61
+ # ^^foo^^
62
+ '^^' content:(!'^^' inline_item)+ '^^'
63
+ <Superscript>
64
+ end
65
+
66
+ rule subscript
67
+ # _^foo^_
68
+ '_^' content:(!'^_' inline_item)+ '^_'
69
+ <Subscript>
70
+ end
71
+
60
72
  end
61
73
  end
62
74
  end
@@ -71,6 +71,26 @@ module Slaw
71
71
  end
72
72
  end
73
73
 
74
+ class Superscript < Treetop::Runtime::SyntaxNode
75
+ def to_xml(b, idprefix)
76
+ b.sup { |b|
77
+ for e in content.elements
78
+ e.inline_item.to_xml(b, idprefix)
79
+ end
80
+ }
81
+ end
82
+ end
83
+
84
+ class Subscript < Treetop::Runtime::SyntaxNode
85
+ def to_xml(b, idprefix)
86
+ b.sub { |b|
87
+ for e in content.elements
88
+ e.inline_item.to_xml(b, idprefix)
89
+ end
90
+ }
91
+ end
92
+ end
93
+
74
94
  end
75
95
  end
76
96
  end
@@ -10,7 +10,7 @@ module Slaw
10
10
 
11
11
  class ScheduleContainer < Treetop::Runtime::SyntaxNode
12
12
  def to_xml(b, idprefix="")
13
- b.components { |b|
13
+ b.attachments { |b|
14
14
  schedules.children.elements.each_with_index { |e, i|
15
15
  e.to_xml(b, idprefix, i+1)
16
16
  }
@@ -86,6 +86,9 @@ module Slaw
86
86
  end
87
87
 
88
88
  def to_xml(b, idprefix=nil, i=1)
89
+ # reset counters for this new schedule document
90
+ Slaw::Grammars::Counters.reset!
91
+
89
92
  heading_text = self.schedule_title.heading_text
90
93
  if not heading_text
91
94
  heading_text = "Schedule"
@@ -95,12 +98,13 @@ module Slaw
95
98
  # the schedule id is derived from the heading
96
99
  schedule_id = self.schedule_id(heading_text, i)
97
100
 
98
- b.component(id: "component-#{schedule_id}") { |b|
99
- b.doc_(name: schedule_id) { |b|
101
+ b.attachment(eId: "att_#{i}") { |b|
102
+ schedule_title.to_xml(b, '', heading_text)
103
+ b.doc_(name: "schedule") { |b|
100
104
  b.meta { |b|
101
105
  b.identification(source: "#slaw") { |b|
102
106
  b.FRBRWork { |b|
103
- b.FRBRthis(value: "#{WORK_URI}/#{schedule_id}")
107
+ b.FRBRthis(value: "#{WORK_URI}/!#{schedule_id}")
104
108
  b.FRBRuri(value: WORK_URI)
105
109
  b.FRBRalias(value: heading_text)
106
110
  b.FRBRdate(date: '1980-01-01', name: 'Generation')
@@ -108,14 +112,14 @@ module Slaw
108
112
  b.FRBRcountry(value: 'za')
109
113
  }
110
114
  b.FRBRExpression { |b|
111
- b.FRBRthis(value: "#{EXPRESSION_URI}/#{schedule_id}")
115
+ b.FRBRthis(value: "#{EXPRESSION_URI}/!#{schedule_id}")
112
116
  b.FRBRuri(value: EXPRESSION_URI)
113
117
  b.FRBRdate(date: '1980-01-01', name: 'Generation')
114
118
  b.FRBRauthor(href: '#council')
115
119
  b.FRBRlanguage(language: 'eng')
116
120
  }
117
121
  b.FRBRManifestation { |b|
118
- b.FRBRthis(value: "#{MANIFESTATION_URI}/#{schedule_id}")
122
+ b.FRBRthis(value: "#{MANIFESTATION_URI}/!#{schedule_id}")
119
123
  b.FRBRuri(value: MANIFESTATION_URI)
120
124
  b.FRBRdate(date: Time.now.strftime('%Y-%m-%d'), name: 'Generation')
121
125
  b.FRBRauthor(href: '#slaw')
@@ -124,14 +128,7 @@ module Slaw
124
128
  }
125
129
 
126
130
  b.mainBody { |b|
127
- idprefix = "#{schedule_id}."
128
-
129
- # there is no good AKN hierarchy container for schedules, so we
130
- # use hcontainer instead
131
- b.hcontainer(id: schedule_id, name: "schedule") { |b|
132
- schedule_title.to_xml(b, idprefix, heading_text)
133
- body.children.elements.each_with_index { |e| e.to_xml(b, idprefix, i) } if body.is_a? Body
134
- }
131
+ body.children.elements.each_with_index { |e| e.to_xml(b, '', i) } if body.is_a? Body
135
132
  }
136
133
  }
137
134
  }
@@ -3,7 +3,9 @@ module Slaw
3
3
  module Tables
4
4
  class Table < Treetop::Runtime::SyntaxNode
5
5
  def to_xml(b, idprefix, i=0)
6
- b.table(id: "#{idprefix}table#{i}") { |b|
6
+ cnt = Slaw::Grammars::Counters.counters[idprefix]['table'] += 1
7
+
8
+ b.table(eId: "#{idprefix}table_#{cnt}") { |b|
7
9
  # we'll gather cells into this row list
8
10
  rows = []
9
11
  cells = []
@@ -12,13 +12,13 @@ module Slaw
12
12
  MANIFESTATION_URI = EXPRESSION_URI
13
13
 
14
14
  def to_xml(b, idprefix=nil, i=0)
15
- b.act(contains: "originalVersion") { |b|
15
+ b.act(contains: 'originalVersion', name: 'act') { |b|
16
16
  write_meta(b)
17
17
  write_preface(b)
18
18
  write_preamble(b)
19
19
  write_body(b)
20
+ write_schedules(b)
20
21
  }
21
- write_schedules(b)
22
22
  end
23
23
 
24
24
  def write_meta(b)
@@ -26,8 +26,8 @@ module Slaw
26
26
  write_identification(b)
27
27
 
28
28
  b.references(source: "#this") {
29
- b.TLCOrganization(id: 'slaw', href: 'https://github.com/longhotsummer/slaw', showAs: "Slaw")
30
- b.TLCOrganization(id: 'council', href: '/ontology/organization/za/council', showAs: "Council")
29
+ b.TLCOrganization(eId: 'slaw', href: 'https://github.com/longhotsummer/slaw', showAs: "Slaw")
30
+ b.TLCOrganization(eId: 'council', href: '/ontology/organization/za/council', showAs: "Council")
31
31
  }
32
32
  }
33
33
  end
@@ -38,7 +38,7 @@ module Slaw
38
38
  b.FRBRWork { |b|
39
39
  b.FRBRthis(value: "#{WORK_URI}/main")
40
40
  b.FRBRuri(value: WORK_URI)
41
- b.FRBRalias(value: 'Short Title')
41
+ b.FRBRalias(value: 'Short Title', name: 'title')
42
42
  b.FRBRdate(date: '1980-01-01', name: 'Generation')
43
43
  b.FRBRauthor(href: '#council')
44
44
  b.FRBRcountry(value: 'za')
@@ -125,7 +125,7 @@ module Slaw
125
125
  if !stmts.empty?
126
126
  b.preamble { |b|
127
127
  stmts.each { |e|
128
- e.preamble_statement.to_xml(b, "")
128
+ e.preamble_statement.to_xml(b, "preamble__")
129
129
  }
130
130
  }
131
131
  end
@@ -138,11 +138,11 @@ module Slaw
138
138
  end
139
139
 
140
140
  def to_xml(b, id_prefix='', *args)
141
- id = id_prefix + "part-#{num}"
141
+ id = id_prefix + "part_#{Slaw::Grammars::Counters.clean(num)}"
142
142
 
143
- b.part(id: id) { |b|
143
+ b.part(eId: id) { |b|
144
144
  heading.to_xml(b)
145
- children.elements.each_with_index { |e, i| e.to_xml(b, id + '.', i) }
145
+ children.elements.each_with_index { |e, i| e.to_xml(b, id + '__', i) }
146
146
  }
147
147
  end
148
148
  end
@@ -171,13 +171,15 @@ module Slaw
171
171
  num = self.num
172
172
  if num.empty?
173
173
  num = Slaw::Grammars::Counters.counters[id_prefix]['subpart'] += 1
174
+ else
175
+ num = Slaw::Grammars::Counters.clean(num)
174
176
  end
175
177
 
176
- id = id_prefix + "subpart-#{num}"
178
+ id = id_prefix + "subpart_#{num}"
177
179
 
178
- b.subpart(id: id) { |b|
180
+ b.subpart(eId: id) { |b|
179
181
  heading.to_xml(b)
180
- children.elements.each_with_index { |e, i| e.to_xml(b, id + '.', i) }
182
+ children.elements.each_with_index { |e, i| e.to_xml(b, id + '__', i) }
181
183
  }
182
184
  end
183
185
  end
@@ -203,11 +205,11 @@ module Slaw
203
205
  end
204
206
 
205
207
  def to_xml(b, id_prefix='', *args)
206
- id = id_prefix + "chapter-#{num}"
208
+ id = id_prefix + "chp_#{Slaw::Grammars::Counters.clean(num)}"
207
209
 
208
- b.chapter(id: id) { |b|
210
+ b.chapter(eId: id) { |b|
209
211
  heading.to_xml(b)
210
- children.elements.each_with_index { |e, i| e.to_xml(b, id + '.', i) }
212
+ children.elements.each_with_index { |e, i| e.to_xml(b, id + '__', i) }
211
213
  }
212
214
  end
213
215
  end
@@ -233,11 +235,11 @@ module Slaw
233
235
  end
234
236
 
235
237
  def to_xml(b, *args)
236
- id = "section-#{num}"
237
- b.section(id: id) { |b|
238
+ id = "sec_#{Slaw::Grammars::Counters.clean(num)}"
239
+ b.section(eId: id) { |b|
238
240
  section_title.to_xml(b)
239
241
 
240
- idprefix = "#{id}."
242
+ idprefix = "#{id}__"
241
243
  children.elements.each_with_index { |e, i| e.to_xml(b, idprefix, i) }
242
244
  }
243
245
  end
@@ -308,11 +310,11 @@ module Slaw
308
310
 
309
311
  class BlockElements < Treetop::Runtime::SyntaxNode
310
312
  def to_xml(b, idprefix='', i=0)
311
- cnt = Slaw::Grammars::Counters.counters[idprefix]['paragraph'] += 1
312
- id = "#{idprefix}paragraph#{cnt}"
313
- idprefix = "#{id}."
313
+ cnt = Slaw::Grammars::Counters.counters[idprefix]['hcontainer'] += 1
314
+ id = "#{idprefix}hcontainer_#{cnt}"
315
+ idprefix = "#{id}__"
314
316
 
315
- b.paragraph(id: id) { |b|
317
+ b.hcontainer(eId: id, name: 'hcontainer') { |b|
316
318
  b.content { |b|
317
319
  elements.each_with_index { |e, i| e.to_xml(b, idprefix, i) }
318
320
  }
@@ -326,10 +328,10 @@ module Slaw
326
328
  end
327
329
 
328
330
  def to_xml(b, idprefix, i)
329
- id = idprefix + num.gsub(/[()]/, '')
330
- idprefix = id + "."
331
+ id = idprefix + "subsec_" + Slaw::Grammars::Counters.clean(num)
332
+ idprefix = id + "__"
331
333
 
332
- b.subsection(id: id) { |b|
334
+ b.subsection(eId: id) { |b|
333
335
  b.num(num)
334
336
  block_elements_with_inline.to_xml(b, idprefix)
335
337
  }
@@ -341,10 +343,10 @@ module Slaw
341
343
  # yield to it a builder to insert a listIntroduction node
342
344
  def to_xml(b, idprefix, i=0, &block)
343
345
  cnt = Slaw::Grammars::Counters.counters[idprefix]['list'] += 1
344
- id = idprefix + "list#{cnt}"
345
- idprefix = id + '.'
346
+ id = idprefix + "list_#{cnt}"
347
+ idprefix = id + '__'
346
348
 
347
- b.blockList(id: id, renest: true) { |b|
349
+ b.blockList(eId: id, renest: true) { |b|
348
350
  b.listIntroduction { |b| yield b } if block_given?
349
351
 
350
352
  elements.each { |e| e.to_xml(b, idprefix) }
@@ -358,7 +360,7 @@ module Slaw
358
360
  end
359
361
 
360
362
  def to_xml(b, idprefix)
361
- b.item(id: idprefix + num.gsub(/[()]/, '')) { |b|
363
+ b.item(eId: idprefix + "item_" + Slaw::Grammars::Counters.clean(num)) { |b|
362
364
  b.num(num)
363
365
  b.p { |b|
364
366
  item_content.inline_items.to_xml(b, idprefix) if respond_to? :item_content and item_content.respond_to? :inline_items
@@ -369,10 +371,10 @@ module Slaw
369
371
 
370
372
  class Crossheading < Treetop::Runtime::SyntaxNode
371
373
  def to_xml(b, idprefix, i=0)
372
- cnt = Slaw::Grammars::Counters.counters[idprefix]['crossheading'] += 1
373
- id = "#{idprefix}crossheading-#{cnt}"
374
+ cnt = Slaw::Grammars::Counters.counters[idprefix]['hcontainer'] += 1
375
+ id = "#{idprefix}hcontainer_#{cnt}"
374
376
 
375
- b.hcontainer(id: id, name: 'crossheading') { |b|
377
+ b.hcontainer(eId: id, name: 'crossheading') { |b|
376
378
  b.heading { |b|
377
379
  inline_items.to_xml(b, idprefix)
378
380
  }