slaw 0.7.3 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ea560c6ca4986938adc55f944e1530e2f213af40
4
- data.tar.gz: 27c25b8edf02ca79a169f3ba8ae6e8215f44c716
3
+ metadata.gz: 26b655c478bd057b9c89b7a7972984e387f9c447
4
+ data.tar.gz: 9c352182961b3c6a8dc6676c2de43e71777dc7ba
5
5
  SHA512:
6
- metadata.gz: 6c456e3373e43272b6ec6c8e15bc848c61e479cca955ff454daf06949c947ddc6b26b02b2f3d995663e608a434bea91553d60b13aa5a31965bd2509ad2840c0e
7
- data.tar.gz: 38741a5d5cd8a9a5c72b9959719f6051c0e0ae88b56a8be50bd6fb3bc7f2e67c4121a3ec604047209f22be3ee46764c3898f0df8bbbeaa9630b4da63f634dce6
6
+ metadata.gz: f1d882e1bbe07aa2ef08fcbeb5d82eb25aa4caede00b876a342ca050f89f4c184699c65e9a47787ceed1750644d9c634cb1281a0e070fad6f933bdca38ee6c23
7
+ data.tar.gz: 823fee99147547c3683685f443b6bd72b755fe840013059fa0ffecd02fae3a0bc3d0ccfcb66ecc6f419ba0aa2dbe42b44c27738f18db111232723a95fc169107
data/README.md CHANGED
@@ -216,6 +216,15 @@ Akoma Ntoso `component` elements at the end of the XML document, with a name of
216
216
 
217
217
  ## Changelog
218
218
 
219
+ ### 0.8.0
220
+
221
+ * FEATURE: parse command only reformats input for PDFs or when --reformat is given
222
+ * FIX: don't error on defn tags without link to defined term
223
+
224
+ ### 0.7.4
225
+
226
+ * use refersTo to identify blocks containing term definitions, rather than setting an (invalid) ID
227
+
219
228
  ### 0.7.3
220
229
 
221
230
  * add link-definitions command to find and extract defined terms and link them to their definitions
data/bin/slaw CHANGED
@@ -21,6 +21,7 @@ class SlawCLI < Thor
21
21
  option :fragment, type: :string, desc: "Akoma Ntoso element name that the imported text represents. Support depends on the grammar."
22
22
  option :id_prefix, type: :string, desc: "Prefix to be used when generating ID elements when parsing a fragment."
23
23
  option :section_number_position, enum: ['before-title', 'after-title', 'guess'], desc: "Where do section titles come in relation to the section number? Default: before-title"
24
+ option :reformat, type: :boolean, desc: "Reformat common formatting issues to make grammar matching better. Default: true for PDF files, false otherwise"
24
25
  def parse(name)
25
26
  logging
26
27
 
@@ -34,6 +35,7 @@ class SlawCLI < Thor
34
35
  case options[:input]
35
36
  when 'pdf'
36
37
  text = extractor.extract_from_pdf(name)
38
+ options[:reformat] = true if options[:reformat].nil?
37
39
  when 'text'
38
40
  text = extractor.extract_from_text(name)
39
41
  else
@@ -42,6 +44,8 @@ class SlawCLI < Thor
42
44
 
43
45
  generator = Slaw::ActGenerator.new
44
46
 
47
+ text = generator.reformat(text) if options[:reformat]
48
+
45
49
  if options[:fragment]
46
50
  generator.document_class = Slaw::Fragment
47
51
 
@@ -17,14 +17,7 @@ module Slaw
17
17
 
18
18
  @@pdftotext_path = "pdftotext"
19
19
 
20
- # Object with text cleaning helpers
21
- attr_accessor :cleanser
22
-
23
- def initialize
24
- @cleanser = Slaw::Parse::Cleanser.new
25
- end
26
-
27
- # Extract text from a file and run cleanup on it.
20
+ # Extract text from a file.
28
21
  #
29
22
  # @param filename [String] filename to extract from
30
23
  #
@@ -61,7 +54,7 @@ module Slaw
61
54
 
62
55
  case status.exitstatus
63
56
  when 0
64
- return cleanup(stdout)
57
+ return stdout
65
58
  when 3
66
59
  return nil if retried
67
60
  retried = true
@@ -82,7 +75,7 @@ module Slaw
82
75
  end
83
76
 
84
77
  def extract_from_text(filename)
85
- cleanup(File.read(filename))
78
+ File.read(filename)
86
79
  end
87
80
 
88
81
  # Extract text from +filename+ by sending it to apache tika
@@ -99,15 +92,6 @@ module Slaw
99
92
  text
100
93
  end
101
94
 
102
- # Run general once-off cleanup of extracted text.
103
- def cleanup(text)
104
- text = @cleanser.cleanup(text)
105
- text = @cleanser.remove_empty_lines(text)
106
- text = @cleanser.reformat(text)
107
-
108
- text
109
- end
110
-
111
95
  def remove_pdf_password(filename)
112
96
  file = Tempfile.new('steno')
113
97
  begin
@@ -30,10 +30,18 @@ module Slaw
30
30
  act
31
31
  end
32
32
 
33
+ # Run basic cleanup on text, such as ensuring clean newlines
34
+ # and removing tabs. This is always automatically done before
35
+ # processing.
33
36
  def cleanup(text)
34
- text = @cleanser.cleanup(text)
35
- text = @cleanser.reformat(text)
36
- text
37
+ @cleanser.cleanup(text)
38
+ end
39
+
40
+ # Reformat some common errors in text to help make parsing more
41
+ # successful. Optional and only recommended when processing a document
42
+ # for the first time.
43
+ def reformat(text)
44
+ @cleanser.reformat(text)
37
45
  end
38
46
 
39
47
  # Try to determine if section numbers come after titles,
@@ -222,16 +222,30 @@ module Slaw
222
222
  terms = {}
223
223
  doc.xpath('//a:def', a: NS).each do |defn|
224
224
  # <p>"<def refersTo="#term-affected_land">affected land</def>" means land in respect of which an application has been lodged in terms of section 17(1);</p>
225
- id = defn['refersTo'].sub(/^#/, '')
226
- term = defn.content
227
- terms[id] = term
225
+ if defn['refersTo']
226
+ id = defn['refersTo'].sub(/^#/, '')
227
+ term = defn.content
228
+ terms[id] = term
228
229
 
229
- logger.info("+ Found definition for: #{term}")
230
+ logger.info("+ Found definition for: #{term}")
231
+ end
230
232
  end
231
233
 
232
234
  terms
233
235
  end
234
236
 
237
+ # Find defined terms in the document.
238
+ #
239
+ # This looks for heading elements with the words 'definitions' or 'interpretation',
240
+ # and then looks for phrases like
241
+ #
242
+ # "this word" means something...
243
+ #
244
+ # It identifies "this word" as a defined term and wraps it in a def tag with a refersTo
245
+ # attribute referencing the term being defined. The surrounding block
246
+ # structure also has its refersTo attribute set to the term. This way, the term
247
+ # is both marked as defined, and the container element with the full
248
+ # definition of the term is identified.
235
249
  def guess_at_definitions(doc)
236
250
  doc.xpath('//a:section', a: NS).select do |section|
237
251
  # sections with headings like Definitions
@@ -254,16 +268,17 @@ module Slaw
254
268
  term_id = 'term-' + term.gsub(/[^a-zA-Z0-9_-]/, '_')
255
269
 
256
270
  # <p>"<def refersTo="#term-affected_land">affected land</def>" means land in respect of which an application has been lodged in terms of section 17(1);</p>
257
- defn = doc.create_element('def', term, refersTo: "##{term_id}")
271
+ refersTo = "##{term_id}"
272
+ defn = doc.create_element('def', term, refersTo: refersTo)
258
273
  rest = match.post_match
259
274
 
260
275
  text.before(defn)
261
276
  defn.before(doc.create_text_node('"'))
262
277
  text.content = '"' + rest
263
278
 
264
- # adjust the container's id
265
- parent = find_up(container, ['blockList', 'point']) || find_up(container, ['subsection', 'section'])
266
- parent['id'] = "def-#{term_id}"
279
+ # adjust the container's refersTo attribute
280
+ parent = find_up(container, ['item', 'point', 'blockList', 'list', 'paragraph', 'subsection', 'section', 'chapter', 'part'])
281
+ parent['refersTo'] = refersTo
267
282
  end
268
283
  end
269
284
  end
data/lib/slaw/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Slaw
2
- VERSION = "0.7.3"
2
+ VERSION = "0.8.0"
3
3
  end
@@ -9,6 +9,6 @@ describe Slaw::Extract::Extractor do
9
9
  f.write('This is some text')
10
10
  f.rewind
11
11
 
12
- subject.extract_from_file(f.path).should == "This is some text\n"
12
+ subject.extract_from_file(f.path).should == "This is some text"
13
13
  end
14
14
  end
@@ -617,14 +617,14 @@ XML
617
617
  subject.guess_at_definitions(doc)
618
618
  doc.to_s.should == section(<<XML
619
619
  <heading>Definitions</heading>
620
- <subsection id="def-term-authorised_official">
620
+ <subsection id="section-1.subsection-1" refersTo="#term-authorised_official">
621
621
  <content>
622
622
  <p>"<def refersTo="#term-authorised_official">authorised official</def>" means any official of the Council who has been authorised by it to administer, implement and enforce the provisions of these By-laws;</p>
623
623
  </content>
624
624
  </subsection>
625
625
  <subsection id="section-1.subsection-2">
626
626
  <content>
627
- <blockList id="def-term-Council">
627
+ <blockList id="section-1.subsection-2.list2" refersTo="#term-Council">
628
628
  <listIntroduction>"<def refersTo="#term-Council">Council</def>" means – </listIntroduction>
629
629
  <item id="section-1.subsection-2.list2.a">
630
630
  <num>(a)</num>
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: slaw
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.3
4
+ version: 0.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Greg Kempe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-08-25 00:00:00.000000000 Z
11
+ date: 2015-09-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler