slaw 0.7.3 → 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +9 -0
- data/bin/slaw +4 -0
- data/lib/slaw/extract/extractor.rb +3 -19
- data/lib/slaw/generator.rb +11 -3
- data/lib/slaw/parse/builder.rb +23 -8
- data/lib/slaw/version.rb +1 -1
- data/spec/extract/extractor_spec.rb +1 -1
- data/spec/parse/builder_spec.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 26b655c478bd057b9c89b7a7972984e387f9c447
|
4
|
+
data.tar.gz: 9c352182961b3c6a8dc6676c2de43e71777dc7ba
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f1d882e1bbe07aa2ef08fcbeb5d82eb25aa4caede00b876a342ca050f89f4c184699c65e9a47787ceed1750644d9c634cb1281a0e070fad6f933bdca38ee6c23
|
7
|
+
data.tar.gz: 823fee99147547c3683685f443b6bd72b755fe840013059fa0ffecd02fae3a0bc3d0ccfcb66ecc6f419ba0aa2dbe42b44c27738f18db111232723a95fc169107
|
data/README.md
CHANGED
@@ -216,6 +216,15 @@ Akoma Ntoso `component` elements at the end of the XML document, with a name of
|
|
216
216
|
|
217
217
|
## Changelog
|
218
218
|
|
219
|
+
### 0.8.0
|
220
|
+
|
221
|
+
* FEATURE: parse command only reformats input for PDFs or when --reformat is given
|
222
|
+
* FIX: don't error on defn tags without link to defined term
|
223
|
+
|
224
|
+
### 0.7.4
|
225
|
+
|
226
|
+
* use refersTo to identify blocks containing term definitions, rather than setting an (invalid) ID
|
227
|
+
|
219
228
|
### 0.7.3
|
220
229
|
|
221
230
|
* add link-definitions command to find and extract defined terms and link them to their definitions
|
data/bin/slaw
CHANGED
@@ -21,6 +21,7 @@ class SlawCLI < Thor
|
|
21
21
|
option :fragment, type: :string, desc: "Akoma Ntoso element name that the imported text represents. Support depends on the grammar."
|
22
22
|
option :id_prefix, type: :string, desc: "Prefix to be used when generating ID elements when parsing a fragment."
|
23
23
|
option :section_number_position, enum: ['before-title', 'after-title', 'guess'], desc: "Where do section titles come in relation to the section number? Default: before-title"
|
24
|
+
option :reformat, type: :boolean, desc: "Reformat common formatting issues to make grammar matching better. Default: true for PDF files, false otherwise"
|
24
25
|
def parse(name)
|
25
26
|
logging
|
26
27
|
|
@@ -34,6 +35,7 @@ class SlawCLI < Thor
|
|
34
35
|
case options[:input]
|
35
36
|
when 'pdf'
|
36
37
|
text = extractor.extract_from_pdf(name)
|
38
|
+
options[:reformat] = true if options[:reformat].nil?
|
37
39
|
when 'text'
|
38
40
|
text = extractor.extract_from_text(name)
|
39
41
|
else
|
@@ -42,6 +44,8 @@ class SlawCLI < Thor
|
|
42
44
|
|
43
45
|
generator = Slaw::ActGenerator.new
|
44
46
|
|
47
|
+
text = generator.reformat(text) if options[:reformat]
|
48
|
+
|
45
49
|
if options[:fragment]
|
46
50
|
generator.document_class = Slaw::Fragment
|
47
51
|
|
@@ -17,14 +17,7 @@ module Slaw
|
|
17
17
|
|
18
18
|
@@pdftotext_path = "pdftotext"
|
19
19
|
|
20
|
-
#
|
21
|
-
attr_accessor :cleanser
|
22
|
-
|
23
|
-
def initialize
|
24
|
-
@cleanser = Slaw::Parse::Cleanser.new
|
25
|
-
end
|
26
|
-
|
27
|
-
# Extract text from a file and run cleanup on it.
|
20
|
+
# Extract text from a file.
|
28
21
|
#
|
29
22
|
# @param filename [String] filename to extract from
|
30
23
|
#
|
@@ -61,7 +54,7 @@ module Slaw
|
|
61
54
|
|
62
55
|
case status.exitstatus
|
63
56
|
when 0
|
64
|
-
return
|
57
|
+
return stdout
|
65
58
|
when 3
|
66
59
|
return nil if retried
|
67
60
|
retried = true
|
@@ -82,7 +75,7 @@ module Slaw
|
|
82
75
|
end
|
83
76
|
|
84
77
|
def extract_from_text(filename)
|
85
|
-
|
78
|
+
File.read(filename)
|
86
79
|
end
|
87
80
|
|
88
81
|
# Extract text from +filename+ by sending it to apache tika
|
@@ -99,15 +92,6 @@ module Slaw
|
|
99
92
|
text
|
100
93
|
end
|
101
94
|
|
102
|
-
# Run general once-off cleanup of extracted text.
|
103
|
-
def cleanup(text)
|
104
|
-
text = @cleanser.cleanup(text)
|
105
|
-
text = @cleanser.remove_empty_lines(text)
|
106
|
-
text = @cleanser.reformat(text)
|
107
|
-
|
108
|
-
text
|
109
|
-
end
|
110
|
-
|
111
95
|
def remove_pdf_password(filename)
|
112
96
|
file = Tempfile.new('steno')
|
113
97
|
begin
|
data/lib/slaw/generator.rb
CHANGED
@@ -30,10 +30,18 @@ module Slaw
|
|
30
30
|
act
|
31
31
|
end
|
32
32
|
|
33
|
+
# Run basic cleanup on text, such as ensuring clean newlines
|
34
|
+
# and removing tabs. This is always automatically done before
|
35
|
+
# processing.
|
33
36
|
def cleanup(text)
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
+
@cleanser.cleanup(text)
|
38
|
+
end
|
39
|
+
|
40
|
+
# Reformat some common errors in text to help make parsing more
|
41
|
+
# successful. Optional and only recommended when processing a document
|
42
|
+
# for the first time.
|
43
|
+
def reformat(text)
|
44
|
+
@cleanser.reformat(text)
|
37
45
|
end
|
38
46
|
|
39
47
|
# Try to determine if section numbers come after titles,
|
data/lib/slaw/parse/builder.rb
CHANGED
@@ -222,16 +222,30 @@ module Slaw
|
|
222
222
|
terms = {}
|
223
223
|
doc.xpath('//a:def', a: NS).each do |defn|
|
224
224
|
# <p>"<def refersTo="#term-affected_land">affected land</def>" means land in respect of which an application has been lodged in terms of section 17(1);</p>
|
225
|
-
|
226
|
-
|
227
|
-
|
225
|
+
if defn['refersTo']
|
226
|
+
id = defn['refersTo'].sub(/^#/, '')
|
227
|
+
term = defn.content
|
228
|
+
terms[id] = term
|
228
229
|
|
229
|
-
|
230
|
+
logger.info("+ Found definition for: #{term}")
|
231
|
+
end
|
230
232
|
end
|
231
233
|
|
232
234
|
terms
|
233
235
|
end
|
234
236
|
|
237
|
+
# Find defined terms in the document.
|
238
|
+
#
|
239
|
+
# This looks for heading elements with the words 'definitions' or 'interpretation',
|
240
|
+
# and then looks for phrases like
|
241
|
+
#
|
242
|
+
# "this word" means something...
|
243
|
+
#
|
244
|
+
# It identifies "this word" as a defined term and wraps it in a def tag with a refersTo
|
245
|
+
# attribute referencing the term being defined. The surrounding block
|
246
|
+
# structure also has its refersTo attribute set to the term. This way, the term
|
247
|
+
# is both marked as defined, and the container element with the full
|
248
|
+
# definition of the term is identified.
|
235
249
|
def guess_at_definitions(doc)
|
236
250
|
doc.xpath('//a:section', a: NS).select do |section|
|
237
251
|
# sections with headings like Definitions
|
@@ -254,16 +268,17 @@ module Slaw
|
|
254
268
|
term_id = 'term-' + term.gsub(/[^a-zA-Z0-9_-]/, '_')
|
255
269
|
|
256
270
|
# <p>"<def refersTo="#term-affected_land">affected land</def>" means land in respect of which an application has been lodged in terms of section 17(1);</p>
|
257
|
-
|
271
|
+
refersTo = "##{term_id}"
|
272
|
+
defn = doc.create_element('def', term, refersTo: refersTo)
|
258
273
|
rest = match.post_match
|
259
274
|
|
260
275
|
text.before(defn)
|
261
276
|
defn.before(doc.create_text_node('"'))
|
262
277
|
text.content = '"' + rest
|
263
278
|
|
264
|
-
# adjust the container's
|
265
|
-
parent = find_up(container, ['
|
266
|
-
parent['
|
279
|
+
# adjust the container's refersTo attribute
|
280
|
+
parent = find_up(container, ['item', 'point', 'blockList', 'list', 'paragraph', 'subsection', 'section', 'chapter', 'part'])
|
281
|
+
parent['refersTo'] = refersTo
|
267
282
|
end
|
268
283
|
end
|
269
284
|
end
|
data/lib/slaw/version.rb
CHANGED
data/spec/parse/builder_spec.rb
CHANGED
@@ -617,14 +617,14 @@ XML
|
|
617
617
|
subject.guess_at_definitions(doc)
|
618
618
|
doc.to_s.should == section(<<XML
|
619
619
|
<heading>Definitions</heading>
|
620
|
-
<subsection id="
|
620
|
+
<subsection id="section-1.subsection-1" refersTo="#term-authorised_official">
|
621
621
|
<content>
|
622
622
|
<p>"<def refersTo="#term-authorised_official">authorised official</def>" means any official of the Council who has been authorised by it to administer, implement and enforce the provisions of these By-laws;</p>
|
623
623
|
</content>
|
624
624
|
</subsection>
|
625
625
|
<subsection id="section-1.subsection-2">
|
626
626
|
<content>
|
627
|
-
<blockList id="
|
627
|
+
<blockList id="section-1.subsection-2.list2" refersTo="#term-Council">
|
628
628
|
<listIntroduction>"<def refersTo="#term-Council">Council</def>" means – </listIntroduction>
|
629
629
|
<item id="section-1.subsection-2.list2.a">
|
630
630
|
<num>(a)</num>
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: slaw
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Greg Kempe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-09-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|