slaw 10.2.0 → 10.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f76b66445a595e5130d1a3af8b153498d41007416bcd7c5ced418d4339030303
4
- data.tar.gz: cd28f4839c8ecd4b430a6111a3c5ec3df764af84674b1e6205d54ac27b3c734f
3
+ metadata.gz: 2feb9ea5e726f4300a0920de90b88f14b3a86ce4ae3dddd9a59b3bf8c855e575
4
+ data.tar.gz: 263ef515bc1a81b8cccc1788a3b107b573f3840492913cedf17ae729745d1c4b
5
5
  SHA512:
6
- metadata.gz: f04d34041c3dc21f552b81af8d331d4440cd7c36d94e38007d15be8b467cca15b2bedbc1b5d1fb7d652ebd2b263f695a02afd337ce20df5c0e5eca7b1ec02050
7
- data.tar.gz: dae3ef8b12f13761543a90b1711b301af9a3368cbde2387c8c00adf717c6a297ddfcfe108b9ef53073b370178574877a978fb8968d63317ed736ca1c49417232
6
+ metadata.gz: 573bab26b9e880856e74404058208e9fae5f400f0ee64413d771fea872e4b3b7c626ebefb5c020321dc4b4f35919b25b94801c6622e82d2f2a8dfe3507c35f61
7
+ data.tar.gz: 01d73bb0903dbee8213eae364316835361b4d34bbe0ed4a12f9dcaeb114bf1e5ec9ac46133e0570bea1998af0daaac610591da905463d64e63f1449ffd529db7
data/.travis.yml CHANGED
@@ -2,6 +2,5 @@ language: ruby
2
2
  rvm:
3
3
  - 2.7.0
4
4
  - 2.6.2
5
- - 2.5.4
6
5
  before_install:
7
6
  - gem update bundler
data/README.md CHANGED
@@ -86,6 +86,26 @@ You can create your own grammar by creating a gem that provides these files and
86
86
 
87
87
  ## Changelog
88
88
 
89
+ ### 10.6.0 (10 May 2021)
90
+
91
+ * Handle sup and sub when extracting from HTML.
92
+
93
+ ### 10.5.0 (20 April 2021)
94
+
95
+ * Handle escaping inlines when unparsing.
96
+
97
+ ### 10.4.1 (14 April 2021)
98
+
99
+ * Handle escaping in inlines, so that forward slashes in link text are unescaped correctly, e.g. `[https:\/\/example.com](https://example.com)`.
100
+
101
+ ### 10.4.0 (9 April 2021)
102
+
103
+ * Remove dependency on mimemagic. Guess file type based on filename instead.
104
+
105
+ ### 10.3.1 (11 January 2021)
106
+
107
+ * Strip ASCII punctuation and the Unicode General Punctuation and Supplemental Punctuation blocks from num elements when building eIds.
108
+
89
109
  ### 10.2.0 (4 September 2020)
90
110
 
91
111
  * support inline superscript `^^text^^`
@@ -1,5 +1,3 @@
1
- require 'mimemagic'
2
-
3
1
  module Slaw
4
2
  module Extract
5
3
 
@@ -13,15 +11,10 @@ module Slaw
13
11
  #
14
12
  # @return [String] extracted text
15
13
  def extract_from_file(filename)
16
- mimetype = get_mimetype(filename)
17
-
18
- case mimetype && mimetype.type
19
- when 'text/html'
14
+ if filename.end_with? '.html' or filename.end_with? '.htm'
20
15
  extract_from_html(filename)
21
- when 'text/plain', nil
22
- extract_from_text(filename)
23
16
  else
24
- raise ArgumentError.new("Unsupported file type #{mimetype || 'unknown'}")
17
+ extract_from_text(filename)
25
18
  end
26
19
  end
27
20
 
@@ -11,9 +11,10 @@
11
11
 
12
12
  <xsl:template match="head|style|script|link" />
13
13
 
14
- <xsl:template match="ul|ol">
14
+ <!-- block containers that end with newlines -->
15
+ <xsl:template match="ul|ol|section|article|h1|h2|h3|h4|h5">
15
16
  <xsl:apply-templates />
16
- <xsl:text>&#10;</xsl:text>
17
+ <xsl:text>&#10;&#10;</xsl:text>
17
18
  </xsl:template>
18
19
 
19
20
  <xsl:template match="ul/li">
@@ -23,20 +24,23 @@
23
24
  <xsl:text>&#10;</xsl:text>
24
25
  </xsl:template>
25
26
 
27
+ <!-- numbered lists should include a number -->
26
28
  <xsl:template match="ol/li">
27
- <!-- 1. foo -->
29
+ <!-- \1. foo -->
28
30
  <xsl:text>\</xsl:text>
29
- <xsl:value-of select="position()" />
31
+ <xsl:choose>
32
+ <xsl:when test="@value">
33
+ <xsl:value-of select="@value" />
34
+ </xsl:when>
35
+ <xsl:otherwise>
36
+ <xsl:value-of select="position()" />
37
+ </xsl:otherwise>
38
+ </xsl:choose>
30
39
  <xsl:text>. </xsl:text>
31
40
  <xsl:apply-templates />
32
41
  <xsl:text>&#10;</xsl:text>
33
42
  </xsl:template>
34
43
 
35
- <xsl:template match="h1|h2|h3|h4|h5">
36
- <xsl:apply-templates />
37
- <xsl:text>&#10;&#10;</xsl:text>
38
- </xsl:template>
39
-
40
44
  <xsl:template match="p|div">
41
45
  <xsl:choose>
42
46
  <xsl:when test="starts-with(., '[') and substring(., string-length(.)) = ']'">
@@ -51,32 +55,27 @@
51
55
  <xsl:text>&#10;&#10;</xsl:text>
52
56
  </xsl:template>
53
57
 
58
+ <!-- START tables -->
59
+
54
60
  <xsl:template match="table">
55
61
  <xsl:text>{| </xsl:text>
56
- <xsl:text>
57
- |-</xsl:text>
62
+ <xsl:text>&#10;|-</xsl:text>
58
63
  <xsl:apply-templates />
59
- <xsl:text>
60
- |}
61
-
62
- </xsl:text>
64
+ <xsl:text>&#10;|}&#10;&#10;</xsl:text>
63
65
  </xsl:template>
64
66
 
65
67
  <xsl:template match="tr">
66
68
  <xsl:apply-templates />
67
- <xsl:text>
68
- |-</xsl:text>
69
+ <xsl:text>&#10;|-</xsl:text>
69
70
  </xsl:template>
70
71
 
71
72
  <xsl:template match="th|td">
72
73
  <xsl:choose>
73
74
  <xsl:when test="local-name(.) = 'th'">
74
- <xsl:text>
75
- ! </xsl:text>
75
+ <xsl:text>&#10;! </xsl:text>
76
76
  </xsl:when>
77
77
  <xsl:when test="local-name(.) = 'td'">
78
- <xsl:text>
79
- | </xsl:text>
78
+ <xsl:text>&#10;| </xsl:text>
80
79
  </xsl:when>
81
80
  </xsl:choose>
82
81
 
@@ -118,8 +117,15 @@
118
117
  </xsl:template>
119
118
 
120
119
  <xsl:template match="br">
121
- <xsl:text>
122
- </xsl:text>
120
+ <xsl:text>&#10;</xsl:text>
121
+ </xsl:template>
122
+
123
+ <xsl:template match="sup">
124
+ <xsl:text>^^</xsl:text><xsl:apply-templates /><xsl:text>^^</xsl:text>
125
+ </xsl:template>
126
+
127
+ <xsl:template match="sub">
128
+ <xsl:text>_^</xsl:text><xsl:apply-templates /><xsl:text>^_</xsl:text>
123
129
  </xsl:template>
124
130
 
125
131
 
@@ -24,20 +24,32 @@ module Slaw
24
24
  # Clean a <num> value for use in an eId
25
25
  # See https://docs.oasis-open.org/legaldocml/akn-nc/v1.0/os/akn-nc-v1.0-os.html#_Toc531692306
26
26
  #
27
- # The number part of the identifiers of such elements corresponds to the
27
+ # "The number part of the identifiers of such elements corresponds to the
28
28
  # stripping of all final punctuation, meaningless separations as well as
29
29
  # redundant characters in the content of the <num> element. The
30
- # representation is case-sensitive
30
+ # representation is case-sensitive."
31
+ #
32
+ # Our algorithm is:
33
+ # 1. strip all leading and trailing whitespace and punctuation (ASCII punctuation plus the unicode punctuation blocks)
34
+ # 2. strip all whitespace
35
+ # 3. replace each remaining run of punctuation with a single hyphen.
36
+ #
37
+ # The General Punctuation block is \u2000-\u206F, and the Supplemental Punctuation block is \u2E00-\u2E7F.
31
38
  #
32
39
  # (i) -> i
33
40
  # 1.2. -> 1-2
41
+ # “2.3” -> 2-3
34
42
  # 3a bis -> 3abis
35
43
  def self.clean(num)
44
+ # leading whitespace and punctuation
45
+ num = num.gsub(/^[\s\u{2000}-\u{206f}\u{2e00}-\u{2e7f}!"#$%&'()*+,\-.\/:;<=>?@\[\]^_`{|}~]+/, '')
46
+ # trailing whitespace and punctuation
47
+ num.gsub!(/[\s\u{2000}-\u{206f}\u{2e00}-\u{2e7f}!"#$%&'()*+,\-.\/:;<=>?@\[\]^_`{|}~]+$/, '')
48
+ # whitespace
49
+ num.gsub!(/\s/, '')
50
+ # remaining punctuation to a hyphen
51
+ num.gsub!(/[\u{2000}-\u{206f}\u{2e00}-\u{2e7f}!"#$%&'()*+,\-.\/:;<=>?@\[\]^_`{|}~]+/, '-')
36
52
  num
37
- .gsub(/[ ()\[\]]/, '')
38
- .gsub(/\.+$/, '')
39
- .gsub(/^\.+/, '')
40
- .gsub(/\.+/, '-')
41
53
  end
42
54
  end
43
55
  end
@@ -20,7 +20,7 @@ module Slaw
20
20
  end
21
21
 
22
22
  rule inline_item
23
- remark / image / ref / bold / italics / superscript / subscript / [^\n]
23
+ remark / image / ref / bold / italics / superscript / subscript / '\\'? [^\n]
24
24
  <InlineItem>
25
25
  end
26
26
 
@@ -37,7 +37,12 @@ module Slaw
37
37
 
38
38
  class InlineItem < Treetop::Runtime::SyntaxNode
39
39
  def to_xml(b, idprefix)
40
- b.text(text_value)
40
+ if text_value.start_with? '\\'
41
+ # handle escaped characters: \a -> a
42
+ b.text(text_value[1..])
43
+ else
44
+ b.text(text_value)
45
+ end
41
46
  end
42
47
  end
43
48
 
@@ -9,31 +9,141 @@
9
9
  <xsl:strip-space elements="*"/>
10
10
  <xsl:preserve-space elements="a:a a:affectedDocument a:b a:block a:caption a:change a:concept a:courtType a:date a:def a:del a:docCommittee a:docDate a:docIntroducer a:docJurisdiction a:docNumber a:docProponent a:docPurpose a:docStage a:docStatus a:docTitle a:docType a:docketNumber a:entity a:event a:extractText a:fillIn a:from a:heading a:i a:inline a:ins a:judge a:lawyer a:legislature a:li a:listConclusion a:listIntroduction a:location a:mmod a:mod a:mref a:narrative a:neutralCitation a:num a:object a:omissis a:opinion a:organization a:outcome a:p a:party a:person a:placeholder a:process a:quantity a:quotedText a:recordedTime a:ref a:relatedDocument a:remark a:rmod a:role a:rref a:scene a:session a:shortTitle a:signature a:span a:sub a:subheading a:summary a:sup a:term a:tocItem a:u a:vote"/>
11
11
 
12
+ <!-- replaces "value" in "text" with "replacement" -->
13
+ <xsl:template name="string-replace-all">
14
+ <xsl:param name="text" />
15
+ <xsl:param name="value" />
16
+ <xsl:param name="replacement" />
17
+
18
+ <xsl:choose>
19
+ <xsl:when test="$text = '' or $value = '' or not($value)">
20
+ <xsl:value-of select="$text" />
21
+ </xsl:when>
22
+ <xsl:when test="contains($text, $value)">
23
+ <xsl:value-of select="substring-before($text, $value)"/>
24
+ <xsl:value-of select="$replacement" />
25
+ <xsl:call-template name="string-replace-all">
26
+ <xsl:with-param name="text" select="substring-after($text, $value)" />
27
+ <xsl:with-param name="value" select="$value" />
28
+ <xsl:with-param name="replacement" select="$replacement" />
29
+ </xsl:call-template>
30
+ </xsl:when>
31
+ <xsl:otherwise>
32
+ <xsl:value-of select="$text" />
33
+ </xsl:otherwise>
34
+ </xsl:choose>
35
+ </xsl:template>
36
+
37
+ <!-- Escape inline markers with a backslash -->
38
+ <xsl:template name="escape-inlines">
39
+ <xsl:param name="text" />
40
+
41
+ <!-- This works from the inside out, first escaping backslash chars themselves, then escaping
42
+ the different types of inline markers -->
43
+ <xsl:call-template name="string-replace-all">
44
+ <xsl:with-param name="text">
45
+ <xsl:call-template name="string-replace-all">
46
+ <xsl:with-param name="text">
47
+ <xsl:call-template name="string-replace-all">
48
+ <xsl:with-param name="text">
49
+ <xsl:call-template name="string-replace-all">
50
+ <xsl:with-param name="text">
51
+ <xsl:call-template name="string-replace-all">
52
+ <xsl:with-param name="text">
53
+ <xsl:call-template name="string-replace-all">
54
+ <xsl:with-param name="text">
55
+ <xsl:call-template name="string-replace-all">
56
+ <xsl:with-param name="text">
57
+ <xsl:call-template name="string-replace-all">
58
+ <xsl:with-param name="text">
59
+ <xsl:call-template name="string-replace-all">
60
+ <xsl:with-param name="text">
61
+ <xsl:call-template name="string-replace-all">
62
+ <xsl:with-param name="text" select="$text" />
63
+ <xsl:with-param name="value"><xsl:value-of select="'\'" /></xsl:with-param>
64
+ <xsl:with-param name="replacement"><xsl:value-of select="'\\'" /></xsl:with-param>
65
+ </xsl:call-template>
66
+ </xsl:with-param>
67
+ <xsl:with-param name="value"><xsl:value-of select="'**'" /></xsl:with-param>
68
+ <xsl:with-param name="replacement"><xsl:value-of select="'\*\*'" /></xsl:with-param>
69
+ </xsl:call-template>
70
+ </xsl:with-param>
71
+ <xsl:with-param name="value"><xsl:value-of select="'//'" /></xsl:with-param>
72
+ <xsl:with-param name="replacement"><xsl:value-of select="'\/\/'" /></xsl:with-param>
73
+ </xsl:call-template>
74
+ </xsl:with-param>
75
+ <xsl:with-param name="value"><xsl:value-of select="'_^'" /></xsl:with-param>
76
+ <xsl:with-param name="replacement"><xsl:value-of select="'\_^'" /></xsl:with-param>
77
+ </xsl:call-template>
78
+ </xsl:with-param>
79
+ <xsl:with-param name="value"><xsl:value-of select="'^_'" /></xsl:with-param>
80
+ <xsl:with-param name="replacement"><xsl:value-of select="'\^_'" /></xsl:with-param>
81
+ </xsl:call-template>
82
+ </xsl:with-param>
83
+ <xsl:with-param name="value"><xsl:value-of select="'^^'" /></xsl:with-param>
84
+ <xsl:with-param name="replacement"><xsl:value-of select="'\^\^'" /></xsl:with-param>
85
+ </xsl:call-template>
86
+ </xsl:with-param>
87
+ <xsl:with-param name="value"><xsl:value-of select="'!['" /></xsl:with-param>
88
+ <xsl:with-param name="replacement"><xsl:value-of select="'\!['" /></xsl:with-param>
89
+ </xsl:call-template>
90
+ </xsl:with-param>
91
+ <xsl:with-param name="value"><xsl:value-of select="']('" /></xsl:with-param>
92
+ <xsl:with-param name="replacement"><xsl:value-of select="'\]('" /></xsl:with-param>
93
+ </xsl:call-template>
94
+ </xsl:with-param>
95
+ <xsl:with-param name="value"><xsl:value-of select="'[['" /></xsl:with-param>
96
+ <xsl:with-param name="replacement"><xsl:value-of select="'\[\['" /></xsl:with-param>
97
+ </xsl:call-template>
98
+ </xsl:with-param>
99
+ <xsl:with-param name="value"><xsl:value-of select="']]'" /></xsl:with-param>
100
+ <xsl:with-param name="replacement"><xsl:value-of select="'\]\]'" /></xsl:with-param>
101
+ </xsl:call-template>
102
+ </xsl:template>
103
+
12
104
  <!-- adds a backslash to the start of the value param, if necessary -->
13
- <xsl:template name="escape">
105
+ <xsl:template name="escape-prefixes">
14
106
  <xsl:param name="value"/>
15
107
 
16
108
  <xsl:variable name="prefix" select="translate(substring($value, 1, 13), 'abcdefghijklmnopqrstuvwxyz', 'ABCDEFGHIJKLMNOPQRSTUVWXYZ')" />
17
109
  <!-- '(' is considered special, so translate numbers into '(' so we can find and escape them -->
18
110
  <xsl:variable name="numprefix" select="translate(substring($value, 1, 3), '1234567890', '((((((((((')" />
19
111
 
20
- <!-- p tags must escape initial content that looks like a block element marker -->
21
- <xsl:if test="$prefix = 'BODY' or
22
- $prefix = 'PREAMBLE' or
23
- $prefix = 'PREFACE' or
24
- starts-with($prefix, 'CHAPTER ') or
25
- starts-with($prefix, 'PART ') or
26
- starts-with($prefix, 'SUBPART ') or
27
- starts-with($prefix, 'SCHEDULE ') or
28
- starts-with($prefix, 'HEADING ') or
29
- starts-with($prefix, 'SUBHEADING ') or
30
- starts-with($prefix, 'LONGTITLE ') or
31
- starts-with($prefix, 'CROSSHEADING ') or
32
- starts-with($prefix, '{|') or
33
- starts-with($numprefix, '(')">
34
- <xsl:text>\</xsl:text>
35
- </xsl:if>
36
- <xsl:value-of select="$value"/>
112
+ <xsl:variable name="slash">
113
+ <!-- p tags must escape initial content that looks like a block element marker -->
114
+ <xsl:if test="$prefix = 'BODY' or
115
+ $prefix = 'PREAMBLE' or
116
+ $prefix = 'PREFACE' or
117
+ starts-with($prefix, 'CHAPTER ') or
118
+ starts-with($prefix, 'PART ') or
119
+ starts-with($prefix, 'SUBPART ') or
120
+ starts-with($prefix, 'SCHEDULE ') or
121
+ starts-with($prefix, 'HEADING ') or
122
+ starts-with($prefix, 'SUBHEADING ') or
123
+ starts-with($prefix, 'LONGTITLE ') or
124
+ starts-with($prefix, 'CROSSHEADING ') or
125
+ starts-with($prefix, '{|') or
126
+ starts-with($numprefix, '(')">
127
+ <xsl:value-of select="'\'" />
128
+ </xsl:if>
129
+ </xsl:variable>
130
+
131
+ <xsl:value-of select="concat($slash, $value)" />
132
+ </xsl:template>
133
+
134
+ <!-- escapes inline markers in the value param, then adds a backslash to the start of the result, if necessary -->
135
+ <xsl:template name="escape">
136
+ <xsl:param name="value"/>
137
+
138
+ <xsl:variable name="escaped">
139
+ <xsl:call-template name="escape-inlines">
140
+ <xsl:with-param name="text" select="$value" />
141
+ </xsl:call-template>
142
+ </xsl:variable>
143
+
144
+ <xsl:call-template name="escape-prefixes">
145
+ <xsl:with-param name="value" select="$escaped" />
146
+ </xsl:call-template>
37
147
  </xsl:template>
38
148
 
39
149
  <xsl:template match="a:act">
@@ -157,12 +267,19 @@
157
267
  </xsl:template>
158
268
 
159
269
  <!-- first text nodes of these elems must be escaped if they have special chars -->
160
- <xsl:template match="a:p[not(ancestor::a:table)]/text()[1] | a:listIntroduction/text()[1] | a:intro/text()[1]">
270
+ <xsl:template match="a:p[not(ancestor::a:table)]/text()[not(preceding-sibling::*)] | a:listIntroduction/text()[not(preceding-sibling::*)] | a:intro/text()[not(preceding-sibling::*)]">
161
271
  <xsl:call-template name="escape">
162
272
  <xsl:with-param name="value" select="." />
163
273
  </xsl:call-template>
164
274
  </xsl:template>
165
275
 
276
+ <!-- escape inlines in text nodes -->
277
+ <xsl:template match="text()">
278
+ <xsl:call-template name="escape-inlines">
279
+ <xsl:with-param name="text" select="." />
280
+ </xsl:call-template>
281
+ </xsl:template>
282
+
166
283
 
167
284
  <!-- attachments/schedules -->
168
285
  <xsl:template match="a:attachment">
@@ -176,7 +293,7 @@
176
293
  <xsl:text>&#10;</xsl:text>
177
294
  </xsl:if>
178
295
 
179
- <xsl:text>&#10;&#10;</xsl:text>
296
+ <xsl:text>&#10;</xsl:text>
180
297
  <xsl:apply-templates select="a:doc/a:mainBody" />
181
298
  </xsl:template>
182
299
 
@@ -192,31 +309,24 @@
192
309
  <xsl:value-of select="." />
193
310
  <xsl:text>" </xsl:text>
194
311
  </xsl:for-each>
195
- <xsl:text>
196
- |-</xsl:text>
312
+ <xsl:text>&#10;|-</xsl:text>
197
313
 
198
314
  <xsl:apply-templates />
199
- <xsl:text>
200
- |}
201
-
202
- </xsl:text>
315
+ <xsl:text>&#10;|}&#10;&#10;</xsl:text>
203
316
  </xsl:template>
204
317
 
205
318
  <xsl:template match="a:tr">
206
319
  <xsl:apply-templates />
207
- <xsl:text>
208
- |-</xsl:text>
320
+ <xsl:text>&#10;|-</xsl:text>
209
321
  </xsl:template>
210
322
 
211
323
  <xsl:template match="a:th|a:td">
212
324
  <xsl:choose>
213
325
  <xsl:when test="local-name(.) = 'th'">
214
- <xsl:text>
215
- ! </xsl:text>
326
+ <xsl:text>&#10;! </xsl:text>
216
327
  </xsl:when>
217
328
  <xsl:when test="local-name(.) = 'td'">
218
- <xsl:text>
219
- | </xsl:text>
329
+ <xsl:text>&#10;| </xsl:text>
220
330
  </xsl:when>
221
331
  </xsl:choose>
222
332
 
data/lib/slaw/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Slaw
2
- VERSION = "10.2.0"
2
+ VERSION = "10.6.0"
3
3
  end
data/slaw.gemspec CHANGED
@@ -25,5 +25,4 @@ Gem::Specification.new do |spec|
25
25
  spec.add_runtime_dependency "treetop", "~> 1.5"
26
26
  spec.add_runtime_dependency "log4r", "~> 1.1"
27
27
  spec.add_runtime_dependency "thor", "~> 0.20"
28
- spec.add_runtime_dependency "mimemagic", "~> 0.2"
29
28
  end
@@ -0,0 +1,38 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'spec_helper'
4
+
5
+ require 'slaw'
6
+
7
+ describe Slaw::Grammars::Counters do
8
+ describe '#clean' do
9
+ it 'should remove leading and trailing punctuation' do
10
+ described_class.clean("").should == ""
11
+ described_class.clean(" ").should == ""
12
+ described_class.clean("( )").should == ""
13
+ described_class.clean("(123.4-5)").should == "123-4-5"
14
+ described_class.clean("(312.32.7)").should == "312-32-7"
15
+ described_class.clean("(312_32_7)").should == "312-32-7"
16
+ described_class.clean("(6)").should == "6"
17
+ described_class.clean("[16]").should == "16"
18
+ described_class.clean("(i)").should == "i"
19
+ described_class.clean("[i]").should == "i"
20
+ described_class.clean("(2bis)").should == "2bis"
21
+ described_class.clean('"1.2.').should == "1-2"
22
+ described_class.clean("1.2.").should == "1-2"
23
+ described_class.clean("“2.3").should == "2-3"
24
+ described_class.clean("2,3").should == "2-3"
25
+ described_class.clean("2,3, 4,").should == "2-3-4"
26
+ described_class.clean("3a bis").should == "3abis"
27
+ described_class.clean("3é").should == "3é"
28
+ described_class.clean(" -3a--4,9").should == "3a-4-9"
29
+ end
30
+
31
+ it 'should handle non-arabic numerals' do
32
+ # hebrew aleph
33
+ described_class.clean("(א)").should == "א"
34
+ # chinese 3
35
+ described_class.clean("(三)").should == "三"
36
+ end
37
+ end
38
+ end
@@ -0,0 +1,20 @@
1
+ BODY
2
+
3
+ 1. Section that tests escapes
4
+
5
+ text \\ with a single slash
6
+
7
+ some **inlines \/\/ [with \/\/ slashes](#foo)**
8
+
9
+ inlines that \*\* should \/\/ be \[\[ escaped \![ and \]\]
10
+
11
+ refs [https:\/\/example.com with ] and and \]( and **nested \*\* stars \*\***](#foo)
12
+
13
+ nested ** stars \*\* in bold \*\***
14
+
15
+ nested // slashes \/\/ in italics \/\///
16
+
17
+ nested ** stars in // italics \*\* // and bold **
18
+
19
+ super ^^with \^\^ hats \^\^^^ and sub _^\_^ with \^_ end tokens \^_^_
20
+
@@ -78,45 +78,81 @@ XML
78
78
 
79
79
  1. Section
80
80
 
81
- \Chapter 2 ignored
81
+ \\Chapter 2 ignored
82
82
 
83
83
  Chapters
84
84
 
85
- \Part 2 ignored
85
+ \\Part 2 ignored
86
86
 
87
87
  participation
88
88
 
89
- \Schedule 2 ignored
89
+ \\Schedule 2 ignored
90
90
 
91
91
  Schedules
92
92
 
93
- \HEADING x
93
+ \\HEADING x
94
94
 
95
- \SUBHEADING x
95
+ \\SUBHEADING x
96
96
 
97
97
  BODY not escaped
98
98
 
99
- \BODY
99
+ \\BODY
100
100
 
101
101
  PREAMBLE not escaped
102
102
 
103
- \PREAMBLE
103
+ \\PREAMBLE
104
104
 
105
105
  PREFACE not escaped
106
106
 
107
- \PREFACE
107
+ \\PREFACE
108
108
 
109
- \2. ignored
109
+ \\2. ignored
110
110
 
111
- \2.1 ignored
111
+ \\2.1 ignored
112
112
 
113
- \(2) ignored
113
+ \\(2) ignored
114
114
 
115
- \(a) ignored
115
+ \\(a) ignored
116
116
 
117
- \(2a) ignored
117
+ \\(2a) ignored
118
118
 
119
- \{| ignored
119
+ \\{| ignored
120
+
121
+ '
122
+ end
123
+
124
+ it 'should escape inlines when unparsing' do
125
+ doc = xml2doc(section(<<'XML'
126
+ <num>1.</num>
127
+ <heading>Section</heading>
128
+ <paragraph id="section-1.paragraph-0">
129
+ <content>
130
+ <p>text \ with a single slash</p>
131
+ <p>some <b>inlines // <ref href="#foo">with // slashes</ref></b></p>
132
+ <p>inlines that ** should // be [[ escaped ![ and ]]</p>
133
+ <p>refs <ref href="#foo">https://example.com with ] and ]( and <b>nested **</b></ref></p>
134
+ <p>super <sup>with ^^</sup> and sub <sub>_^ with ^_</sub></p>
135
+ </content>
136
+ </paragraph>
137
+ XML
138
+ ))
139
+
140
+ text = subject.text_from_act(doc)
141
+ # NOTE: in single quoted strings, backslash sequences aren't considered special, EXCEPT a double backslash
142
+ # which is actually a single backslash. So \\ needs to be \\\\ while \* is just \*. The mind boggles.
143
+ text.should == 'BODY
144
+
145
+ 1. Section
146
+
147
+ text \\\\ with a single slash
148
+
149
+ some **inlines \/\/ [with \/\/ slashes](#foo)**
150
+
151
+ inlines that \*\* should \/\/ be \[\[ escaped \![ and \]\]
152
+
153
+ refs [https:\/\/example.com with ] and \]( and **nested \*\***](#foo)
154
+
155
+ super ^^with \^\^^^ and sub _^\_^ with \^_^_
120
156
 
121
157
  '
122
158
  end
@@ -148,7 +184,7 @@ XML
148
184
 
149
185
  1. Section
150
186
 
151
- \(2) A special meeting [[ foo ]]:
187
+ \\(2) A special meeting [[ foo ]]:
152
188
 
153
189
  (a) the chairperson so directs; or
154
190
 
@@ -269,4 +305,13 @@ Subject to approval in terms of this By-Law.
269
305
  '
270
306
  end
271
307
  end
308
+
309
+ describe 'round trip' do
310
+ it 'should be idempotent for escapes' do
311
+ text = File.open('spec/fixtures/roundtrip-escapes.txt', 'r').read()
312
+ act = subject.generate_from_text(text)
313
+ xml = act.to_xml(encoding: 'utf-8')
314
+ subject.text_from_act(act).should == text
315
+ end
316
+ end
272
317
  end
@@ -117,16 +117,19 @@ EOS
117
117
  it 'should handle escaped content' do
118
118
  node = parse :body, <<EOS
119
119
  \\1. ignored
120
+ foo \\\\bar
120
121
 
121
- \\CROSSHEADING crossheading
122
+ \\CROSSHEADING cross\\heading
122
123
 
123
- 1. Section
124
+ 1. Sec\\tion
124
125
  \\Chapter 2 ignored
126
+ Some text with a \\\\real backslash
125
127
  EOS
126
128
  to_xml(node).should == '<body>
127
129
  <hcontainer eId="hcontainer_1" name="hcontainer">
128
130
  <content>
129
131
  <p>1. ignored</p>
132
+ <p>foo \\bar</p>
130
133
  <p>CROSSHEADING crossheading</p>
131
134
  </content>
132
135
  </hcontainer>
@@ -136,6 +139,7 @@ EOS
136
139
  <hcontainer eId="sec_1__hcontainer_1" name="hcontainer">
137
140
  <content>
138
141
  <p>Chapter 2 ignored</p>
142
+ <p>Some text with a \\real backslash</p>
139
143
  </content>
140
144
  </hcontainer>
141
145
  </section>
@@ -325,6 +325,17 @@ EOS
325
325
  <p>This statement has <remark status="editorial">[<ref href="/foo/bar">a link in</ref> a remark]</remark></p>
326
326
  <p>This statement has <remark status="editorial">[a <ref href="/foo/bar">link in a remark</ref>]</remark></p>
327
327
  </content>
328
+ </hcontainer>'
329
+ end
330
+
331
+ it 'should handle escapes in links' do
332
+ node = parse :generic_container, <<EOS
333
+ Visit the site [https:\\/\\/example.com](https://example.com) for more.
334
+ EOS
335
+ to_xml(node, "").should == '<hcontainer eId="hcontainer_1" name="hcontainer">
336
+ <content>
337
+ <p>Visit the site <ref href="https://example.com">https://example.com</ref> for more.</p>
338
+ </content>
328
339
  </hcontainer>'
329
340
  end
330
341
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: slaw
3
3
  version: !ruby/object:Gem::Version
4
- version: 10.2.0
4
+ version: 10.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Greg Kempe
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-09-04 00:00:00.000000000 Z
11
+ date: 2021-05-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -94,20 +94,6 @@ dependencies:
94
94
  - - "~>"
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0.20'
97
- - !ruby/object:Gem::Dependency
98
- name: mimemagic
99
- requirement: !ruby/object:Gem::Requirement
100
- requirements:
101
- - - "~>"
102
- - !ruby/object:Gem::Version
103
- version: '0.2'
104
- type: :runtime
105
- prerelease: false
106
- version_requirements: !ruby/object:Gem::Requirement
107
- requirements:
108
- - - "~>"
109
- - !ruby/object:Gem::Version
110
- version: '0.2'
111
97
  description: Slaw is a lightweight library for rendering and generating Akoma Ntoso
112
98
  acts from plain text and PDF documents.
113
99
  email:
@@ -155,8 +141,10 @@ files:
155
141
  - lib/slaw/version.rb
156
142
  - lib/slaw/xml_support.rb
157
143
  - slaw.gemspec
144
+ - spec/counters_spec.rb
158
145
  - spec/extract/extractor_spec.rb
159
146
  - spec/fixtures/community-fire-safety.xml
147
+ - spec/fixtures/roundtrip-escapes.txt
160
148
  - spec/generator_spec.rb
161
149
  - spec/parse/blocklists_spec.rb
162
150
  - spec/parse/builder_spec.rb
@@ -172,7 +160,7 @@ homepage: https://github.com/longhotsummer/slaw
172
160
  licenses:
173
161
  - MIT
174
162
  metadata: {}
175
- post_install_message:
163
+ post_install_message:
176
164
  rdoc_options: []
177
165
  require_paths:
178
166
  - lib
@@ -188,12 +176,14 @@ required_rubygems_version: !ruby/object:Gem::Requirement
188
176
  version: '0'
189
177
  requirements: []
190
178
  rubygems_version: 3.0.3
191
- signing_key:
179
+ signing_key:
192
180
  specification_version: 4
193
181
  summary: A lightweight library for using Akoma Ntoso acts in Ruby.
194
182
  test_files:
183
+ - spec/counters_spec.rb
195
184
  - spec/extract/extractor_spec.rb
196
185
  - spec/fixtures/community-fire-safety.xml
186
+ - spec/fixtures/roundtrip-escapes.txt
197
187
  - spec/generator_spec.rb
198
188
  - spec/parse/blocklists_spec.rb
199
189
  - spec/parse/builder_spec.rb