slaw 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +18 -2
- data/lib/slaw.rb +1 -0
- data/lib/slaw/parse/cleanser.rb +10 -1
- data/lib/slaw/version.rb +1 -1
- data/spec/parse/cleanser_spec.rb +12 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d5870ba16d21c7e3577e3b03f4d361a0afc4f60f
|
4
|
+
data.tar.gz: 7ce6c9cdee52a9156a1b5a26b1a75a583bae80ab
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 24a30dd3aa44f5fa2416548d995fb52228a53c729b0f9834afe1af93c5623f9f22925e9fd939a6611f9b5869b2ed4531db95e059196d0738891c195e861fa42c
|
7
|
+
data.tar.gz: 7880f7ff8953864a056a35864a7c4d4a6a6d4573d755d1f1b8deb4b748e2c5e0d791e0926c837d97abc775aef67dc2c667018a05bb90438ac9964bb73a810683
|
data/README.md
CHANGED
@@ -70,7 +70,7 @@ extractor = Slaw::Extract::Extractor.new
|
|
70
70
|
text = extractor.extract_from_pdf('/path/to/file.pdf')
|
71
71
|
|
72
72
|
# parse the text into a XML and
|
73
|
-
generator = Slaw::ZA::
|
73
|
+
generator = Slaw::ZA::BylawGenerator.new
|
74
74
|
bylaw = generator.generate_from_text(text)
|
75
75
|
puts bylaw.to_xml(indent: 2)
|
76
76
|
|
@@ -125,7 +125,12 @@ between section titles before and after a section number:
|
|
125
125
|
Definitions
|
126
126
|
1. In this by-law, the following words ...
|
127
127
|
|
128
|
-
This must be set by the user before parsing
|
128
|
+
This must be set by the user before parsing:
|
129
|
+
|
130
|
+
```ruby
|
131
|
+
generator = Slaw::ZA::BylawGenerator.new
|
132
|
+
generator.parser.options = {section_number_after_title: true}
|
133
|
+
```
|
129
134
|
|
130
135
|
The parser does its best not to choke on input it doesn't understand, preferring a best effort
|
131
136
|
to a completely accurate result. For example it may not be able to work out a section heading
|
@@ -157,6 +162,17 @@ apply layout rules with CSS.
|
|
157
162
|
|
158
163
|
Slaw can render either an entire document like this, or just a portion of the XML tree.
|
159
164
|
|
165
|
+
```ruby
|
166
|
+
# render an entire document
|
167
|
+
renderer = Slaw::Render::HTMLRenderer.new
|
168
|
+
puts renderer.render(bylaw.doc, '/')
|
169
|
+
|
170
|
+
# render the first section only
|
171
|
+
puts renderer.render(bylaw.sections[0], '/')
|
172
|
+
```
|
173
|
+
|
174
|
+
For more information, see [/lib/slaw/render/html.rb](/lib/slaw/render/html.rb).
|
175
|
+
|
160
176
|
## Meta-data
|
161
177
|
|
162
178
|
Acts and by-laws have metadata which it is not possible to get from their plain text representations,
|
data/lib/slaw.rb
CHANGED
data/lib/slaw/parse/cleanser.rb
CHANGED
@@ -66,11 +66,17 @@ module Slaw
|
|
66
66
|
s.gsub(/^.*Sabinet.*Government Printer.*$/i, '')\
|
67
67
|
.gsub(/^.*Provincial Gazette \d+.*$/i, '')\
|
68
68
|
.gsub(/^.*Provinsiale Koerant \d+.*$/i, '')\
|
69
|
+
.gsub(/^.*PROVINCIAL GAZETTE.*$/, '')\
|
70
|
+
.gsub(/^.*PROVINSIALE KOERANT.*$/, '')\
|
69
71
|
.gsub(/^\s*\d+\s*$/, '')\
|
72
|
+
.gsub(/^.*This gazette is also available.*$/, '')\
|
70
73
|
# get rid of date lines
|
71
74
|
.gsub(/^\d+\s+\w+\s+\d+$/, '')\
|
72
75
|
# get rid of page number lines
|
73
|
-
.gsub(/^\s*page \d+( of \d+)?\s*\n/i, '')
|
76
|
+
.gsub(/^\s*page \d+( of \d+)?\s*\n/i, '')\
|
77
|
+
.gsub(/^\s*\d*\s*No\. \d+$/, '')\
|
78
|
+
# get rid of lines with lots of ____ or ---- chars, they're usually pagebreaks
|
79
|
+
.gsub(/^.*[_-]{5}.*$/, '')
|
74
80
|
end
|
75
81
|
|
76
82
|
# Get rid of whitespace at the end of lines and at the start and end of the
|
@@ -118,6 +124,9 @@ module Slaw
|
|
118
124
|
# "foo" means ...; "bar" means
|
119
125
|
s = s.gsub(/; (["”“][^"”“]+?["”“] means)/, ";\n\\1")
|
120
126
|
|
127
|
+
# CHAPTER 4 PARKING METER PARKING GROUNDS Place of parking
|
128
|
+
s = s.gsub(/([A-Z0-9 ]{5,}) ([A-Z][a-z ]{5,})/, "\\1\n\\2")
|
129
|
+
|
121
130
|
s
|
122
131
|
end
|
123
132
|
|
data/lib/slaw/version.rb
CHANGED
data/spec/parse/cleanser_spec.rb
CHANGED
@@ -61,6 +61,10 @@ permit; and
|
|
61
61
|
it 'should clean up wrapped definition lines after pdf' do
|
62
62
|
subject.break_lines('“agricultural holding” means a portion of land not less than 0.8 hectares in extent used solely or mainly for the purpose of agriculture, horticulture or for breeding or keeping domesticated animals, poultry or bees; “approved” means as approved by the Council; “bund wall” means a containment wall surrounding an above ground storage tank, constructed of an impervious material and designed to contain 110% of the contents of the tank; “certificate of fitness” means a certificate contemplated in section 20; “certificate of registration” means a certificate contemplated in section 35;').should == "“agricultural holding” means a portion of land not less than 0.8 hectares in extent used solely or mainly for the purpose of agriculture, horticulture or for breeding or keeping domesticated animals, poultry or bees;\n“approved” means as approved by the Council;\n“bund wall” means a containment wall surrounding an above ground storage tank, constructed of an impervious material and designed to contain 110% of the contents of the tank;\n“certificate of fitness” means a certificate contemplated in section 20;\n“certificate of registration” means a certificate contemplated in section 35;"
|
63
63
|
end
|
64
|
+
|
65
|
+
it 'should break at CAPCASE TO Normal Case' do
|
66
|
+
subject.break_lines('CHAPTER 3 PARKING METER PARKING GROUNDS Place of parking 7. No person may park or cause or permit to be parked any vehicle or allow a vehicle to be or remain in a parking meter parking ground otherwise than in a parking bay.').should == "CHAPTER 3 PARKING METER PARKING GROUNDS\nPlace of parking 7. No person may park or cause or permit to be parked any vehicle or allow a vehicle to be or remain in a parking meter parking ground otherwise than in a parking bay."
|
67
|
+
end
|
64
68
|
end
|
65
69
|
|
66
70
|
describe '#strip_toc' do
|
@@ -125,4 +129,12 @@ Definitions and interpretation
|
|
125
129
|
1. (1) In this Chapter, unless the context otherwise indicates-"
|
126
130
|
end
|
127
131
|
end
|
132
|
+
|
133
|
+
describe '#remove_boilerplate' do
|
134
|
+
it 'should handle no toc' do
|
135
|
+
s = "(2)The provisions of section 12 (1) (a), (b), (d) and (g) and section 12(2), (3), (4) and (5), read with the necessary changes, apply to the taking into custody of cats.\nClaiming of impounded dogs and cats\n_____________________________________________________________________________________________ ___________ By-laws relating to Dogs and Cats for Promulgation\n14. (1) Any person may claim an impounded dog or cat if he or she –\n(a) satisfies the poundmaster that he or she is the owner or is otherwise entitled to the custody of the dog or cat concerned;"
|
136
|
+
|
137
|
+
subject.remove_boilerplate(s).should == "(2)The provisions of section 12 (1) (a), (b), (d) and (g) and section 12(2), (3), (4) and (5), read with the necessary changes, apply to the taking into custody of cats.\nClaiming of impounded dogs and cats\n\n14. (1) Any person may claim an impounded dog or cat if he or she –\n(a) satisfies the poundmaster that he or she is the owner or is otherwise entitled to the custody of the dog or cat concerned;"
|
138
|
+
end
|
139
|
+
end
|
128
140
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: slaw
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Greg Kempe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-11-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|