slaw 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 30603c7c9387a2f1c2fc9d617f667b41824e0b68
4
- data.tar.gz: 2b153cb4679f469f4b0b18e4ba8b6da239d69016
3
+ metadata.gz: d5870ba16d21c7e3577e3b03f4d361a0afc4f60f
4
+ data.tar.gz: 7ce6c9cdee52a9156a1b5a26b1a75a583bae80ab
5
5
  SHA512:
6
- metadata.gz: a1fb11223dfbd14614eafaf1436e2b73c17fbdbc3fc0511f54492e3cab616c0a1db4f6ebdff0d0e56b498c23f3de1c8ce81ff7cd67e1784253dd6f22457976a2
7
- data.tar.gz: 5d46d60e58c26cc44fef10a23f81de366361d538bfe35f649b6f6a81ea121ade63785d76d081fba26fc6413981ba831771f1c32aa5014bb8a02159122f3f40d5
6
+ metadata.gz: 24a30dd3aa44f5fa2416548d995fb52228a53c729b0f9834afe1af93c5623f9f22925e9fd939a6611f9b5869b2ed4531db95e059196d0738891c195e861fa42c
7
+ data.tar.gz: 7880f7ff8953864a056a35864a7c4d4a6a6d4573d755d1f1b8deb4b748e2c5e0d791e0926c837d97abc775aef67dc2c667018a05bb90438ac9964bb73a810683
data/README.md CHANGED
@@ -70,7 +70,7 @@ extractor = Slaw::Extract::Extractor.new
70
70
  text = extractor.extract_from_pdf('/path/to/file.pdf')
71
71
 
72
72
  # parse the text into an XML and
73
- generator = Slaw::ZA::ByLawGenerator.new
73
+ generator = Slaw::ZA::BylawGenerator.new
74
74
  bylaw = generator.generate_from_text(text)
75
75
  puts bylaw.to_xml(indent: 2)
76
76
 
@@ -125,7 +125,12 @@ between section titles before and after a section number:
125
125
  Definitions
126
126
  1. In this by-law, the following words ...
127
127
 
128
- This must be set by the user before parsing.
128
+ This must be set by the user before parsing:
129
+
130
+ ```ruby
131
+ generator = Slaw::ZA::BylawGenerator.new
132
+ generator.parser.options = {section_number_after_title: true}
133
+ ```
129
134
 
130
135
  The parser does its best not to choke on input it doesn't understand, preferring a best effort
131
136
  to a completely accurate result. For example, it may not be able to work out a section heading
@@ -157,6 +162,17 @@ apply layout rules with CSS.
157
162
 
158
163
  Slaw can render either an entire document like this, or just a portion of the XML tree.
159
164
 
165
+ ```ruby
166
+ # render an entire document
167
+ renderer = Slaw::Render::HTMLRenderer.new
168
+ puts renderer.render(bylaw.doc, '/')
169
+
170
+ # render the first section only
171
+ puts renderer.render(bylaw.sections[0], '/')
172
+ ```
173
+
174
+ For more information, see [/lib/slaw/render/html.rb](/lib/slaw/render/html.rb).
175
+
160
176
  ## Meta-data
161
177
 
162
178
  Acts and by-laws have metadata which it is not possible to get from their plain text representations,
data/lib/slaw.rb CHANGED
@@ -17,6 +17,7 @@ require 'slaw/parse/builder'
17
17
  require 'slaw/parse/cleanser'
18
18
  require 'slaw/parse/error'
19
19
 
20
+ require 'slaw/za/bylaw_generator'
20
21
  require 'slaw/extract/extractor'
21
22
 
22
23
  module Slaw
@@ -66,11 +66,17 @@ module Slaw
66
66
  s.gsub(/^.*Sabinet.*Government Printer.*$/i, '')\
67
67
  .gsub(/^.*Provincial Gazette \d+.*$/i, '')\
68
68
  .gsub(/^.*Provinsiale Koerant \d+.*$/i, '')\
69
+ .gsub(/^.*PROVINCIAL GAZETTE.*$/, '')\
70
+ .gsub(/^.*PROVINSIALE KOERANT.*$/, '')\
69
71
  .gsub(/^\s*\d+\s*$/, '')\
72
+ .gsub(/^.*This gazette is also available.*$/, '')\
70
73
  # get rid of date lines
71
74
  .gsub(/^\d+\s+\w+\s+\d+$/, '')\
72
75
  # get rid of page number lines
73
- .gsub(/^\s*page \d+( of \d+)?\s*\n/i, '')
76
+ .gsub(/^\s*page \d+( of \d+)?\s*\n/i, '')\
77
+ .gsub(/^\s*\d*\s*No\. \d+$/, '')\
78
+ # get rid of lines with lots of ____ or ---- chars, they're usually pagebreaks
79
+ .gsub(/^.*[_-]{5}.*$/, '')
74
80
  end
75
81
 
76
82
  # Get rid of whitespace at the end of lines and at the start and end of the
@@ -118,6 +124,9 @@ module Slaw
118
124
  # "foo" means ...; "bar" means
119
125
  s = s.gsub(/; (["”“][^"”“]+?["”“] means)/, ";\n\\1")
120
126
 
127
+ # CHAPTER 4 PARKING METER PARKING GROUNDS Place of parking
128
+ s = s.gsub(/([A-Z0-9 ]{5,}) ([A-Z][a-z ]{5,})/, "\\1\n\\2")
129
+
121
130
  s
122
131
  end
123
132
 
data/lib/slaw/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Slaw
2
- VERSION = "0.3.0"
2
+ VERSION = "0.3.1"
3
3
  end
@@ -61,6 +61,10 @@ permit; and
61
61
  it 'should clean up wrapped definition lines after pdf' do
62
62
  subject.break_lines('“agricultural holding” means a portion of land not less than 0.8 hectares in extent used solely or mainly for the purpose of agriculture, horticulture or for breeding or keeping domesticated animals, poultry or bees; “approved” means as approved by the Council; “bund wall” means a containment wall surrounding an above ground storage tank, constructed of an impervious material and designed to contain 110% of the contents of the tank; “certificate of fitness” means a certificate contemplated in section 20; “certificate of registration” means a certificate contemplated in section 35;').should == "“agricultural holding” means a portion of land not less than 0.8 hectares in extent used solely or mainly for the purpose of agriculture, horticulture or for breeding or keeping domesticated animals, poultry or bees;\n“approved” means as approved by the Council;\n“bund wall” means a containment wall surrounding an above ground storage tank, constructed of an impervious material and designed to contain 110% of the contents of the tank;\n“certificate of fitness” means a certificate contemplated in section 20;\n“certificate of registration” means a certificate contemplated in section 35;"
63
63
  end
64
+
65
+ it 'should break at CAPCASE TO Normal Case' do
66
+ subject.break_lines('CHAPTER 3 PARKING METER PARKING GROUNDS Place of parking 7. No person may park or cause or permit to be parked any vehicle or allow a vehicle to be or remain in a parking meter parking ground otherwise than in a parking bay.').should == "CHAPTER 3 PARKING METER PARKING GROUNDS\nPlace of parking 7. No person may park or cause or permit to be parked any vehicle or allow a vehicle to be or remain in a parking meter parking ground otherwise than in a parking bay."
67
+ end
64
68
  end
65
69
 
66
70
  describe '#strip_toc' do
@@ -125,4 +129,12 @@ Definitions and interpretation
125
129
  1. (1) In this Chapter, unless the context otherwise indicates-"
126
130
  end
127
131
  end
132
+
133
+ describe '#remove_boilerplate' do
134
+ it 'should handle no toc' do
135
+ s = "(2)The provisions of section 12 (1) (a), (b), (d) and (g) and section 12(2), (3), (4) and (5), read with the necessary changes, apply to the taking into custody of cats.\nClaiming of impounded dogs and cats\n_____________________________________________________________________________________________ ___________ By-laws relating to Dogs and Cats for Promulgation\n14. (1) Any person may claim an impounded dog or cat if he or she –\n(a) satisfies the poundmaster that he or she is the owner or is otherwise entitled to the custody of the dog or cat concerned;"
136
+
137
+ subject.remove_boilerplate(s).should == "(2)The provisions of section 12 (1) (a), (b), (d) and (g) and section 12(2), (3), (4) and (5), read with the necessary changes, apply to the taking into custody of cats.\nClaiming of impounded dogs and cats\n\n14. (1) Any person may claim an impounded dog or cat if he or she –\n(a) satisfies the poundmaster that he or she is the owner or is otherwise entitled to the custody of the dog or cat concerned;"
138
+ end
139
+ end
128
140
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: slaw
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Greg Kempe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-09-23 00:00:00.000000000 Z
11
+ date: 2014-11-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler