slaw 0.7.3 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +9 -0
- data/bin/slaw +4 -0
- data/lib/slaw/extract/extractor.rb +3 -19
- data/lib/slaw/generator.rb +11 -3
- data/lib/slaw/parse/builder.rb +23 -8
- data/lib/slaw/version.rb +1 -1
- data/spec/extract/extractor_spec.rb +1 -1
- data/spec/parse/builder_spec.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 26b655c478bd057b9c89b7a7972984e387f9c447
|
4
|
+
data.tar.gz: 9c352182961b3c6a8dc6676c2de43e71777dc7ba
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f1d882e1bbe07aa2ef08fcbeb5d82eb25aa4caede00b876a342ca050f89f4c184699c65e9a47787ceed1750644d9c634cb1281a0e070fad6f933bdca38ee6c23
|
7
|
+
data.tar.gz: 823fee99147547c3683685f443b6bd72b755fe840013059fa0ffecd02fae3a0bc3d0ccfcb66ecc6f419ba0aa2dbe42b44c27738f18db111232723a95fc169107
|
data/README.md
CHANGED
@@ -216,6 +216,15 @@ Akoma Ntoso `component` elements at the end of the XML document, with a name of
|
|
216
216
|
|
217
217
|
## Changelog
|
218
218
|
|
219
|
+
### 0.8.0
|
220
|
+
|
221
|
+
* FEATURE: parse command only reformats input for PDFs or when --reformat is given
|
222
|
+
* FIX: don't error on defn tags without link to defined term
|
223
|
+
|
224
|
+
### 0.7.4
|
225
|
+
|
226
|
+
* use refersTo to identify blocks containing term definitions, rather than setting an (invalid) ID
|
227
|
+
|
219
228
|
### 0.7.3
|
220
229
|
|
221
230
|
* add link-definitions command to find and extract defined terms and link them to their definitions
|
data/bin/slaw
CHANGED
@@ -21,6 +21,7 @@ class SlawCLI < Thor
|
|
21
21
|
option :fragment, type: :string, desc: "Akoma Ntoso element name that the imported text represents. Support depends on the grammar."
|
22
22
|
option :id_prefix, type: :string, desc: "Prefix to be used when generating ID elements when parsing a fragment."
|
23
23
|
option :section_number_position, enum: ['before-title', 'after-title', 'guess'], desc: "Where do section titles come in relation to the section number? Default: before-title"
|
24
|
+
option :reformat, type: :boolean, desc: "Reformat common formatting issues to make grammar matching better. Default: true for PDF files, false otherwise"
|
24
25
|
def parse(name)
|
25
26
|
logging
|
26
27
|
|
@@ -34,6 +35,7 @@ class SlawCLI < Thor
|
|
34
35
|
case options[:input]
|
35
36
|
when 'pdf'
|
36
37
|
text = extractor.extract_from_pdf(name)
|
38
|
+
options[:reformat] = true if options[:reformat].nil?
|
37
39
|
when 'text'
|
38
40
|
text = extractor.extract_from_text(name)
|
39
41
|
else
|
@@ -42,6 +44,8 @@ class SlawCLI < Thor
|
|
42
44
|
|
43
45
|
generator = Slaw::ActGenerator.new
|
44
46
|
|
47
|
+
text = generator.reformat(text) if options[:reformat]
|
48
|
+
|
45
49
|
if options[:fragment]
|
46
50
|
generator.document_class = Slaw::Fragment
|
47
51
|
|
data/lib/slaw/extract/extractor.rb
CHANGED
@@ -17,14 +17,7 @@ module Slaw
|
|
17
17
|
|
18
18
|
@@pdftotext_path = "pdftotext"
|
19
19
|
|
20
|
-
#
|
21
|
-
attr_accessor :cleanser
|
22
|
-
|
23
|
-
def initialize
|
24
|
-
@cleanser = Slaw::Parse::Cleanser.new
|
25
|
-
end
|
26
|
-
|
27
|
-
# Extract text from a file and run cleanup on it.
|
20
|
+
# Extract text from a file.
|
28
21
|
#
|
29
22
|
# @param filename [String] filename to extract from
|
30
23
|
#
|
@@ -61,7 +54,7 @@ module Slaw
|
|
61
54
|
|
62
55
|
case status.exitstatus
|
63
56
|
when 0
|
64
|
-
return
|
57
|
+
return stdout
|
65
58
|
when 3
|
66
59
|
return nil if retried
|
67
60
|
retried = true
|
@@ -82,7 +75,7 @@ module Slaw
|
|
82
75
|
end
|
83
76
|
|
84
77
|
def extract_from_text(filename)
|
85
|
-
|
78
|
+
File.read(filename)
|
86
79
|
end
|
87
80
|
|
88
81
|
# Extract text from +filename+ by sending it to apache tika
|
@@ -99,15 +92,6 @@ module Slaw
|
|
99
92
|
text
|
100
93
|
end
|
101
94
|
|
102
|
-
# Run general once-off cleanup of extracted text.
|
103
|
-
def cleanup(text)
|
104
|
-
text = @cleanser.cleanup(text)
|
105
|
-
text = @cleanser.remove_empty_lines(text)
|
106
|
-
text = @cleanser.reformat(text)
|
107
|
-
|
108
|
-
text
|
109
|
-
end
|
110
|
-
|
111
95
|
def remove_pdf_password(filename)
|
112
96
|
file = Tempfile.new('steno')
|
113
97
|
begin
|
data/lib/slaw/generator.rb
CHANGED
@@ -30,10 +30,18 @@ module Slaw
|
|
30
30
|
act
|
31
31
|
end
|
32
32
|
|
33
|
+
# Run basic cleanup on text, such as ensuring clean newlines
|
34
|
+
# and removing tabs. This is always automatically done before
|
35
|
+
# processing.
|
33
36
|
def cleanup(text)
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
+
@cleanser.cleanup(text)
|
38
|
+
end
|
39
|
+
|
40
|
+
# Reformat some common errors in text to help make parsing more
|
41
|
+
# successful. Optional and only recommended when processing a document
|
42
|
+
# for the first time.
|
43
|
+
def reformat(text)
|
44
|
+
@cleanser.reformat(text)
|
37
45
|
end
|
38
46
|
|
39
47
|
# Try to determine if section numbers come after titles,
|
data/lib/slaw/parse/builder.rb
CHANGED
@@ -222,16 +222,30 @@ module Slaw
|
|
222
222
|
terms = {}
|
223
223
|
doc.xpath('//a:def', a: NS).each do |defn|
|
224
224
|
# <p>"<def refersTo="#term-affected_land">affected land</def>" means land in respect of which an application has been lodged in terms of section 17(1);</p>
|
225
|
-
|
226
|
-
|
227
|
-
|
225
|
+
if defn['refersTo']
|
226
|
+
id = defn['refersTo'].sub(/^#/, '')
|
227
|
+
term = defn.content
|
228
|
+
terms[id] = term
|
228
229
|
|
229
|
-
|
230
|
+
logger.info("+ Found definition for: #{term}")
|
231
|
+
end
|
230
232
|
end
|
231
233
|
|
232
234
|
terms
|
233
235
|
end
|
234
236
|
|
237
|
+
# Find defined terms in the document.
|
238
|
+
#
|
239
|
+
# This looks for heading elements with the words 'definitions' or 'interpretation',
|
240
|
+
# and then looks for phrases like
|
241
|
+
#
|
242
|
+
# "this word" means something...
|
243
|
+
#
|
244
|
+
# It identifies "this word" as a defined term and wraps it in a def tag with a refersTo
|
245
|
+
# attribute referencing the term being defined. The surrounding block
|
246
|
+
# structure also has its refersTo attribute set to the term. This way, the term
|
247
|
+
# is both marked as defined, and the container element with the full
|
248
|
+
# definition of the term is identified.
|
235
249
|
def guess_at_definitions(doc)
|
236
250
|
doc.xpath('//a:section', a: NS).select do |section|
|
237
251
|
# sections with headings like Definitions
|
@@ -254,16 +268,17 @@ module Slaw
|
|
254
268
|
term_id = 'term-' + term.gsub(/[^a-zA-Z0-9_-]/, '_')
|
255
269
|
|
256
270
|
# <p>"<def refersTo="#term-affected_land">affected land</def>" means land in respect of which an application has been lodged in terms of section 17(1);</p>
|
257
|
-
|
271
|
+
refersTo = "##{term_id}"
|
272
|
+
defn = doc.create_element('def', term, refersTo: refersTo)
|
258
273
|
rest = match.post_match
|
259
274
|
|
260
275
|
text.before(defn)
|
261
276
|
defn.before(doc.create_text_node('"'))
|
262
277
|
text.content = '"' + rest
|
263
278
|
|
264
|
-
# adjust the container's
|
265
|
-
parent = find_up(container, ['
|
266
|
-
parent['
|
279
|
+
# adjust the container's refersTo attribute
|
280
|
+
parent = find_up(container, ['item', 'point', 'blockList', 'list', 'paragraph', 'subsection', 'section', 'chapter', 'part'])
|
281
|
+
parent['refersTo'] = refersTo
|
267
282
|
end
|
268
283
|
end
|
269
284
|
end
|
data/lib/slaw/version.rb
CHANGED
data/spec/parse/builder_spec.rb
CHANGED
@@ -617,14 +617,14 @@ XML
|
|
617
617
|
subject.guess_at_definitions(doc)
|
618
618
|
doc.to_s.should == section(<<XML
|
619
619
|
<heading>Definitions</heading>
|
620
|
-
<subsection id="
|
620
|
+
<subsection id="section-1.subsection-1" refersTo="#term-authorised_official">
|
621
621
|
<content>
|
622
622
|
<p>"<def refersTo="#term-authorised_official">authorised official</def>" means any official of the Council who has been authorised by it to administer, implement and enforce the provisions of these By-laws;</p>
|
623
623
|
</content>
|
624
624
|
</subsection>
|
625
625
|
<subsection id="section-1.subsection-2">
|
626
626
|
<content>
|
627
|
-
<blockList id="
|
627
|
+
<blockList id="section-1.subsection-2.list2" refersTo="#term-Council">
|
628
628
|
<listIntroduction>"<def refersTo="#term-Council">Council</def>" means – </listIntroduction>
|
629
629
|
<item id="section-1.subsection-2.list2.a">
|
630
630
|
<num>(a)</num>
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: slaw
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Greg Kempe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-09-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|