slaw 0.7.3 → 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ea560c6ca4986938adc55f944e1530e2f213af40
4
- data.tar.gz: 27c25b8edf02ca79a169f3ba8ae6e8215f44c716
3
+ metadata.gz: 26b655c478bd057b9c89b7a7972984e387f9c447
4
+ data.tar.gz: 9c352182961b3c6a8dc6676c2de43e71777dc7ba
5
5
  SHA512:
6
- metadata.gz: 6c456e3373e43272b6ec6c8e15bc848c61e479cca955ff454daf06949c947ddc6b26b02b2f3d995663e608a434bea91553d60b13aa5a31965bd2509ad2840c0e
7
- data.tar.gz: 38741a5d5cd8a9a5c72b9959719f6051c0e0ae88b56a8be50bd6fb3bc7f2e67c4121a3ec604047209f22be3ee46764c3898f0df8bbbeaa9630b4da63f634dce6
6
+ metadata.gz: f1d882e1bbe07aa2ef08fcbeb5d82eb25aa4caede00b876a342ca050f89f4c184699c65e9a47787ceed1750644d9c634cb1281a0e070fad6f933bdca38ee6c23
7
+ data.tar.gz: 823fee99147547c3683685f443b6bd72b755fe840013059fa0ffecd02fae3a0bc3d0ccfcb66ecc6f419ba0aa2dbe42b44c27738f18db111232723a95fc169107
data/README.md CHANGED
@@ -216,6 +216,15 @@ Akoma Ntoso `component` elements at the end of the XML document, with a name of
216
216
 
217
217
  ## Changelog
218
218
 
219
+ ### 0.8.0
220
+
221
+ * FEATURE: parse command only reformats input for PDFs or when --reformat is given
222
+ * FIX: don't error on defn tags without link to defined term
223
+
224
+ ### 0.7.4
225
+
226
+ * use refersTo to identify blocks containing term definitions, rather than setting an (invalid) ID
227
+
219
228
  ### 0.7.3
220
229
 
221
230
  * add link-definitions command to find and extract defined terms and link them to their definitions
data/bin/slaw CHANGED
@@ -21,6 +21,7 @@ class SlawCLI < Thor
21
21
  option :fragment, type: :string, desc: "Akoma Ntoso element name that the imported text represents. Support depends on the grammar."
22
22
  option :id_prefix, type: :string, desc: "Prefix to be used when generating ID elements when parsing a fragment."
23
23
  option :section_number_position, enum: ['before-title', 'after-title', 'guess'], desc: "Where do section titles come in relation to the section number? Default: before-title"
24
+ option :reformat, type: :boolean, desc: "Reformat common formatting issues to make grammar matching better. Default: true for PDF files, false otherwise"
24
25
  def parse(name)
25
26
  logging
26
27
 
@@ -34,6 +35,7 @@ class SlawCLI < Thor
34
35
  case options[:input]
35
36
  when 'pdf'
36
37
  text = extractor.extract_from_pdf(name)
38
+ options[:reformat] = true if options[:reformat].nil?
37
39
  when 'text'
38
40
  text = extractor.extract_from_text(name)
39
41
  else
@@ -42,6 +44,8 @@ class SlawCLI < Thor
42
44
 
43
45
  generator = Slaw::ActGenerator.new
44
46
 
47
+ text = generator.reformat(text) if options[:reformat]
48
+
45
49
  if options[:fragment]
46
50
  generator.document_class = Slaw::Fragment
47
51
 
@@ -17,14 +17,7 @@ module Slaw
17
17
 
18
18
  @@pdftotext_path = "pdftotext"
19
19
 
20
- # Object with text cleaning helpers
21
- attr_accessor :cleanser
22
-
23
- def initialize
24
- @cleanser = Slaw::Parse::Cleanser.new
25
- end
26
-
27
- # Extract text from a file and run cleanup on it.
20
+ # Extract text from a file.
28
21
  #
29
22
  # @param filename [String] filename to extract from
30
23
  #
@@ -61,7 +54,7 @@ module Slaw
61
54
 
62
55
  case status.exitstatus
63
56
  when 0
64
- return cleanup(stdout)
57
+ return stdout
65
58
  when 3
66
59
  return nil if retried
67
60
  retried = true
@@ -82,7 +75,7 @@ module Slaw
82
75
  end
83
76
 
84
77
  def extract_from_text(filename)
85
- cleanup(File.read(filename))
78
+ File.read(filename)
86
79
  end
87
80
 
88
81
  # Extract text from +filename+ by sending it to apache tika
@@ -99,15 +92,6 @@ module Slaw
99
92
  text
100
93
  end
101
94
 
102
- # Run general once-off cleanup of extracted text.
103
- def cleanup(text)
104
- text = @cleanser.cleanup(text)
105
- text = @cleanser.remove_empty_lines(text)
106
- text = @cleanser.reformat(text)
107
-
108
- text
109
- end
110
-
111
95
  def remove_pdf_password(filename)
112
96
  file = Tempfile.new('steno')
113
97
  begin
@@ -30,10 +30,18 @@ module Slaw
30
30
  act
31
31
  end
32
32
 
33
+ # Run basic cleanup on text, such as ensuring clean newlines
34
+ # and removing tabs. This is always automatically done before
35
+ # processing.
33
36
  def cleanup(text)
34
- text = @cleanser.cleanup(text)
35
- text = @cleanser.reformat(text)
36
- text
37
+ @cleanser.cleanup(text)
38
+ end
39
+
40
+ # Reformat some common errors in text to help make parsing more
41
+ # successful. Optional and only recommended when processing a document
42
+ # for the first time.
43
+ def reformat(text)
44
+ @cleanser.reformat(text)
37
45
  end
38
46
 
39
47
  # Try to determine if section numbers come after titles,
@@ -222,16 +222,30 @@ module Slaw
222
222
  terms = {}
223
223
  doc.xpath('//a:def', a: NS).each do |defn|
224
224
  # <p>"<def refersTo="#term-affected_land">affected land</def>" means land in respect of which an application has been lodged in terms of section 17(1);</p>
225
- id = defn['refersTo'].sub(/^#/, '')
226
- term = defn.content
227
- terms[id] = term
225
+ if defn['refersTo']
226
+ id = defn['refersTo'].sub(/^#/, '')
227
+ term = defn.content
228
+ terms[id] = term
228
229
 
229
- logger.info("+ Found definition for: #{term}")
230
+ logger.info("+ Found definition for: #{term}")
231
+ end
230
232
  end
231
233
 
232
234
  terms
233
235
  end
234
236
 
237
+ # Find defined terms in the document.
238
+ #
239
+ # This looks for heading elements with the words 'definitions' or 'interpretation',
240
+ # and then looks for phrases like
241
+ #
242
+ # "this word" means something...
243
+ #
244
+ # It identifies "this word" as a defined term and wraps it in a def tag with a refersTo
245
+ # attribute referencing the term being defined. The surrounding block
246
+ # structure is also has its refersTo attribute set to the term. This way, the term
247
+ # is both marked as defined, and the container element with the full
248
+ # definition of the term is identified.
235
249
  def guess_at_definitions(doc)
236
250
  doc.xpath('//a:section', a: NS).select do |section|
237
251
  # sections with headings like Definitions
@@ -254,16 +268,17 @@ module Slaw
254
268
  term_id = 'term-' + term.gsub(/[^a-zA-Z0-9_-]/, '_')
255
269
 
256
270
  # <p>"<def refersTo="#term-affected_land">affected land</def>" means land in respect of which an application has been lodged in terms of section 17(1);</p>
257
- defn = doc.create_element('def', term, refersTo: "##{term_id}")
271
+ refersTo = "##{term_id}"
272
+ defn = doc.create_element('def', term, refersTo: refersTo)
258
273
  rest = match.post_match
259
274
 
260
275
  text.before(defn)
261
276
  defn.before(doc.create_text_node('"'))
262
277
  text.content = '"' + rest
263
278
 
264
- # adjust the container's id
265
- parent = find_up(container, ['blockList', 'point']) || find_up(container, ['subsection', 'section'])
266
- parent['id'] = "def-#{term_id}"
279
+ # adjust the container's refersTo attribute
280
+ parent = find_up(container, ['item', 'point', 'blockList', 'list', 'paragraph', 'subsection', 'section', 'chapter', 'part'])
281
+ parent['refersTo'] = refersTo
267
282
  end
268
283
  end
269
284
  end
data/lib/slaw/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Slaw
2
- VERSION = "0.7.3"
2
+ VERSION = "0.8.0"
3
3
  end
@@ -9,6 +9,6 @@ describe Slaw::Extract::Extractor do
9
9
  f.write('This is some text')
10
10
  f.rewind
11
11
 
12
- subject.extract_from_file(f.path).should == "This is some text\n"
12
+ subject.extract_from_file(f.path).should == "This is some text"
13
13
  end
14
14
  end
@@ -617,14 +617,14 @@ XML
617
617
  subject.guess_at_definitions(doc)
618
618
  doc.to_s.should == section(<<XML
619
619
  <heading>Definitions</heading>
620
- <subsection id="def-term-authorised_official">
620
+ <subsection id="section-1.subsection-1" refersTo="#term-authorised_official">
621
621
  <content>
622
622
  <p>"<def refersTo="#term-authorised_official">authorised official</def>" means any official of the Council who has been authorised by it to administer, implement and enforce the provisions of these By-laws;</p>
623
623
  </content>
624
624
  </subsection>
625
625
  <subsection id="section-1.subsection-2">
626
626
  <content>
627
- <blockList id="def-term-Council">
627
+ <blockList id="section-1.subsection-2.list2" refersTo="#term-Council">
628
628
  <listIntroduction>"<def refersTo="#term-Council">Council</def>" means – </listIntroduction>
629
629
  <item id="section-1.subsection-2.list2.a">
630
630
  <num>(a)</num>
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: slaw
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.3
4
+ version: 0.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Greg Kempe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-08-25 00:00:00.000000000 Z
11
+ date: 2015-09-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler