slaw 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +5 -0
- data/Gemfile +0 -1
- data/README.md +25 -3
- data/Rakefile +2 -0
- data/lib/slaw.rb +2 -1
- data/lib/slaw/act.rb +101 -41
- data/lib/slaw/bylaw.rb +16 -23
- data/lib/slaw/collection.rb +30 -2
- data/lib/slaw/extract/extractor.rb +93 -0
- data/lib/slaw/parse/blocklists.rb +25 -17
- data/lib/slaw/parse/builder.rb +97 -22
- data/lib/slaw/parse/bylaw.treetop +2 -0
- data/lib/slaw/parse/cleanser.rb +14 -8
- data/lib/slaw/parse/grammar_helpers.rb +4 -0
- data/lib/slaw/render/html.rb +25 -8
- data/lib/slaw/render/xsl/elements.xsl +1 -1
- data/lib/slaw/version.rb +1 -1
- data/slaw.gemspec +0 -1
- data/spec/extract/extractor_spec.rb +14 -0
- data/spec/parse/builder_spec.rb +2 -0
- data/spec/parse/bylaw_spec.rb +2 -0
- data/spec/parse/cleanser_spec.rb +2 -0
- metadata +6 -17
- data/lib/slaw/elasticsearch.rb +0 -107
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 680301c5ade280eb7da5ea92c509f491631824f2
|
4
|
+
data.tar.gz: f2ddd5a99631121bf3693da5f229e38a6f590142
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 844130f24fa5e4e7e2acd8bacc9381bbd043591676a4fd22e9f1deec87e99b813f3062e4c4ec7286aca4ec0fe2a17161c39d85f5a07c8819192c82cd6203e474
|
7
|
+
data.tar.gz: de11ab3cb747c7341209e79f131506f6e2fc44065a73d95bb936c2b36b348646644024b0b657252106cd4c6d9f1b792ca4f7884e8f45fff4bda453da0a736cb7
|
data/.travis.yml
ADDED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Slaw
|
1
|
+
# Slaw [![Build Status](https://travis-ci.org/longhotsummer/slaw.svg)](http://travis-ci.org/longhotsummer/slaw)
|
2
2
|
|
3
3
|
Slaw is a lightweight library for rendering and generating Akoma Ntoso acts from plain text and PDF documents.
|
4
4
|
It is used to power [openbylaws.org.za](http://openbylaws.org.za).
|
@@ -21,11 +21,33 @@ Or install it yourself as:
|
|
21
21
|
|
22
22
|
TODO: Write usage instructions here
|
23
23
|
|
24
|
+
### Extracting text from PDFs
|
25
|
+
|
26
|
+
You will need [xpdf](http://www.foolabs.com/xpdf/) to run PDF extraction. If you're
|
27
|
+
on a Mac you can use
|
28
|
+
|
29
|
+
brew install xpdf
|
30
|
+
|
31
|
+
Extracting text from PDFs often breaks lines in odd places (or doesn't break them when it should). Slaw gets around
|
32
|
+
this by running some cleanup routines on the extracted text.
|
33
|
+
|
34
|
+
```ruby
|
35
|
+
extractor = Slaw::Extract::Extractor.new
|
36
|
+
|
37
|
+
# to guess the filetype by extension
|
38
|
+
text = extractor.extract_from_file('/path/to/file.pdf')
|
39
|
+
|
40
|
+
# or if you know it's a PDF
|
41
|
+
text = extractor.extract_from_pdf('/path/to/file.pdf')
|
42
|
+
|
43
|
+
# You can also "extract" text from a plain-text file
|
44
|
+
text = extractor.extract_from_text('/path/to/file.txt')
|
45
|
+
```
|
46
|
+
|
24
47
|
## Contributing
|
25
48
|
|
26
|
-
1. Fork it
|
49
|
+
1. Fork it at http://github.com/longhotsummer/slaw/fork
|
27
50
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
28
51
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
29
52
|
4. Push to the branch (`git push origin my-new-feature`)
|
30
53
|
5. Create new Pull Request
|
31
|
-
|
data/Rakefile
CHANGED
data/lib/slaw.rb
CHANGED
data/lib/slaw/act.rb
CHANGED
@@ -1,5 +1,12 @@
|
|
1
1
|
module Slaw
|
2
|
-
#
|
2
|
+
# An Act wraps a single {http://www.akomantoso.org/ AkomaNtoso 2.0 XML} act document in the form of a
|
3
|
+
# Nokogiri::XML::Document object.
|
4
|
+
#
|
5
|
+
# The Act object provides quick access to certain sections of the document,
|
6
|
+
# such as the metadata and the body, as well as common operations such as
|
7
|
+
# identifying whether it has been amended ({#amended?}), repealed
|
8
|
+
# ({#repealed?}) or what chapters ({#chapters}), parts ({#parts}) and
|
9
|
+
# sections ({#sections}) it contains.
|
3
10
|
class Act
|
4
11
|
include Slaw::Namespace
|
5
12
|
|
@@ -7,19 +14,45 @@ module Slaw
|
|
7
14
|
# Act instance itself
|
8
15
|
@@acts = {}
|
9
16
|
|
10
|
-
|
11
|
-
attr_accessor :
|
17
|
+
# [Nokogiri::XML::Document] The underlying {Nokogiri::XML::Document} instance
|
18
|
+
attr_accessor :doc
|
19
|
+
|
20
|
+
# [Nokogiri::XML::Node] The `meta` XML node
|
21
|
+
attr_accessor :meta
|
22
|
+
|
23
|
+
# [Nokogiri::XML::Node] The `body` XML node
|
24
|
+
attr_accessor :body
|
25
|
+
|
26
|
+
# [String] The year this act was published
|
27
|
+
attr_accessor :year
|
28
|
+
|
29
|
+
# [String] The act number in the year this act was published
|
30
|
+
attr_accessor :num
|
31
|
+
|
32
|
+
# [String] The FRBR URI of this act, which uniquely identifies it globally
|
33
|
+
attr_accessor :id_uri
|
34
|
+
|
35
|
+
# [String, nil] The source filename, or nil
|
36
|
+
attr_accessor :filename
|
37
|
+
|
38
|
+
# [Time, nil] The mtime of when the source file was last modified
|
39
|
+
attr_accessor :mtime
|
12
40
|
|
41
|
+
# Get the act that wraps the document that owns this XML node
|
42
|
+
# @param node [Nokogiri::XML::Node]
|
43
|
+
# @return [Act] owning act
|
13
44
|
def self.for_node(node)
|
14
45
|
@@acts[node.document]
|
15
46
|
end
|
16
47
|
|
17
|
-
# Create a new instance
|
48
|
+
# Create a new instance, loading from `filename` if given.
|
49
|
+
# @param filename [String] filename to load XML from
|
18
50
|
def initialize(filename=nil)
|
19
51
|
self.load(filename) if filename
|
20
52
|
end
|
21
53
|
|
22
|
-
# Load the XML
|
54
|
+
# Load the XML in `filename` into this instance
|
55
|
+
# @param filename [String] filename
|
23
56
|
def load(filename)
|
24
57
|
@filename = filename
|
25
58
|
@mtime = File::mtime(@filename)
|
@@ -27,7 +60,8 @@ module Slaw
|
|
27
60
|
File.open(filename) { |f| parse(f) }
|
28
61
|
end
|
29
62
|
|
30
|
-
# Parse the XML contained in the file-like object
|
63
|
+
# Parse the XML contained in the file-like object `io`
|
64
|
+
# @param io [file-like] io object with XML
|
31
65
|
def parse(io)
|
32
66
|
@doc = Nokogiri::XML(io)
|
33
67
|
@meta = @doc.at_xpath('/a:akomaNtoso/a:act/a:meta', a: NS)
|
@@ -35,10 +69,11 @@ module Slaw
|
|
35
69
|
|
36
70
|
@@acts[@doc] = self
|
37
71
|
|
38
|
-
|
72
|
+
_extract_id
|
39
73
|
end
|
40
74
|
|
41
|
-
|
75
|
+
# Parse the FRBR Uri into its constituent parts
|
76
|
+
def _extract_id
|
42
77
|
@id_uri = @meta.at_xpath('./a:identification/a:FRBRWork/a:FRBRuri', a: NS)['value']
|
43
78
|
empty, @country, type, date, @num = @id_uri.split('/')
|
44
79
|
|
@@ -46,48 +81,39 @@ module Slaw
|
|
46
81
|
@year = date.split('-', 2)[0]
|
47
82
|
end
|
48
83
|
|
84
|
+
# An applicable short title for this act, either from the `FRBRalias` element
|
85
|
+
# or based on the act number and year.
|
86
|
+
# @return [String]
|
49
87
|
def short_title
|
50
|
-
|
51
|
-
|
52
|
-
if node
|
53
|
-
@short_title = node['value']
|
54
|
-
else
|
55
|
-
@short_title = "Act #{num} of #{year}"
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
59
|
-
@short_title
|
60
|
-
end
|
61
|
-
|
62
|
-
def url_path
|
63
|
-
"/#{@country}/acts/#{@year}/#{@num}/"
|
64
|
-
end
|
65
|
-
|
66
|
-
def url_file
|
67
|
-
"act-#{@year}-#{@num}"
|
88
|
+
node = @meta.at_xpath('./a:identification/a:FRBRWork/a:FRBRalias', a: NS)
|
89
|
+
node ? node['value'] : "Act #{num} of #{year}"
|
68
90
|
end
|
69
91
|
|
70
|
-
# Has this act been amended?
|
92
|
+
# Has this act been amended? This is determined by testing the `contains`
|
93
|
+
# attribute of the `act` root element.
|
94
|
+
#
|
95
|
+
# @return [Boolean]
|
71
96
|
def amended?
|
72
97
|
@doc.at_xpath('/a:akomaNtoso/a:act', a: NS)['contains'] != 'originalVersion'
|
73
98
|
end
|
74
99
|
|
75
|
-
# a list of LifecycleEvent objects for amendment events, in date order
|
100
|
+
# Get a list of {Slaw::LifecycleEvent} objects for amendment events, in date order.
|
101
|
+
# @return [Array<Slaw::LifecycleEvent>] possibly empty list of lifecycle events
|
76
102
|
def amendment_events
|
77
103
|
@meta.xpath('./a:lifecycle/a:eventRef[@type="amendment"]', a: NS).map do |event|
|
78
104
|
LifecycleEvent.new(event)
|
79
105
|
end.sort_by { |e| e.date }
|
80
106
|
end
|
81
107
|
|
82
|
-
# Mark this act as being amended by another act, either
|
83
|
-
# or the details in
|
84
|
-
#
|
85
|
-
# :uri: uri of the amending act
|
86
|
-
# :title: title of the amending act
|
87
|
-
# :date: date of the amendment
|
108
|
+
# Mark this act as being amended by another act, either `act`
|
109
|
+
# or the details in `opts`.
|
88
110
|
#
|
89
111
|
# It is assumed that there can be only one amendment event on a particular
|
90
112
|
# date. An existing amendment on this date is overwritten.
|
113
|
+
#
|
114
|
+
# @option opts [String] :uri uri of the amending act
|
115
|
+
# @option opts [String] :title title of the amending act
|
116
|
+
# @option opts [String] :date date of the amendment (YYYY-MM-DD)
|
91
117
|
def amended_by!(act, opts={})
|
92
118
|
if act
|
93
119
|
opts[:uri] ||= act.id_uri
|
@@ -133,27 +159,40 @@ module Slaw
|
|
133
159
|
end
|
134
160
|
|
135
161
|
# Does this Act have parts?
|
162
|
+
# @return [Boolean]
|
136
163
|
def parts?
|
137
164
|
!parts.empty?
|
138
165
|
end
|
139
166
|
|
167
|
+
# Top-level parts of this act. Parts inside chapters are ignored.
|
168
|
+
# @return [Array<Nokogiri::XML::Node>] part nodes
|
140
169
|
def parts
|
141
170
|
@body.xpath('./a:part', a: NS)
|
142
171
|
end
|
143
172
|
|
173
|
+
# Does this Act have chapters?
|
174
|
+
# @return [Boolean]
|
144
175
|
def chapters?
|
145
176
|
!chapters.empty?
|
146
177
|
end
|
147
178
|
|
179
|
+
# Top-level chapters of this act. Chapters inside parts are ignored.
|
180
|
+
# @return [Array<Nokogiri::XML::Node>] chapter nodes
|
148
181
|
def chapters
|
149
182
|
@body.xpath('./a:chapter', a: NS)
|
150
183
|
end
|
151
184
|
|
185
|
+
# Sections of this act
|
186
|
+
# @return [Array<Nokogiri::XML::Node>] section nodes
|
152
187
|
def sections
|
153
188
|
@body.xpath('.//a:section', a: NS)
|
154
189
|
end
|
155
190
|
|
156
|
-
# The
|
191
|
+
# The primary definitions section of this act, identified by
|
192
|
+
# either an `id` of `definitions` or the first section with a heading
|
193
|
+
# of `Definitions`.
|
194
|
+
#
|
195
|
+
# @return [Nokogiri::XML::Node, nil] definitions node or nil
|
157
196
|
def definitions
|
158
197
|
# try looking for the definition list
|
159
198
|
defn = @body.at_css('#definitions')
|
@@ -166,14 +205,21 @@ module Slaw
|
|
166
205
|
nil
|
167
206
|
end
|
168
207
|
|
169
|
-
#
|
208
|
+
# An act can contain schedules, additional (generally free-form) documents
|
209
|
+
# that are addendums to the main body. A definition element must be
|
210
|
+
# part of a separate `component` and have a `doc` element with a name attribute
|
211
|
+
# of `schedules`.
|
212
|
+
#
|
213
|
+
# @return [Nokogiri::XML::Node, nil] schedules document node
|
170
214
|
def schedules
|
171
215
|
@doc.at_xpath('/a:akomaNtoso/a:components/a:component/a:doc[@name="schedules"]/a:mainBody', a: NS)
|
172
216
|
end
|
173
217
|
|
174
|
-
# Get a map from term ids to
|
175
|
-
# where
|
176
|
-
# the XML
|
218
|
+
# Get a map from term ids to `[term, defn]` pairs,
|
219
|
+
# where `term` is the plain text term and `defn` is
|
220
|
+
# the {Nokogiri::XML::Node} containing the definition.
|
221
|
+
#
|
222
|
+
# @return [Hash{String => Array(String, Nokogiri::XML::Node)}] map from strings to `[term, definition]` pairs
|
177
223
|
def term_definitions
|
178
224
|
terms = {}
|
179
225
|
|
@@ -191,23 +237,31 @@ module Slaw
|
|
191
237
|
end
|
192
238
|
|
193
239
|
# Returns the publication element, if any.
|
240
|
+
#
|
241
|
+
# @return [Nokogiri::XML::Node, nil]
|
194
242
|
def publication
|
195
243
|
@meta.at_xpath('./a:publication', a: NS)
|
196
244
|
end
|
197
245
|
|
198
246
|
# Has this by-law been repealed?
|
247
|
+
#
|
248
|
+
# @return [Boolean]
|
199
249
|
def repealed?
|
200
250
|
!!repealed_on
|
201
251
|
end
|
202
252
|
|
203
253
|
# The date on which this act was repealed, or nil if never repealed
|
254
|
+
#
|
255
|
+
# @return [Time, nil] date of repeal or nil
|
204
256
|
def repealed_on
|
205
257
|
repeal_el = repeal
|
206
258
|
repeal_el ? Time.parse(repeal_el['date']) : nil
|
207
259
|
end
|
208
260
|
|
209
261
|
# The element representing the reference that caused the repeal of this
|
210
|
-
# act, or nil
|
262
|
+
# act, or nil.
|
263
|
+
#
|
264
|
+
# @return [Nokogiri::XML::Node] element of reference to repealing act, or nil
|
211
265
|
def repealed_by
|
212
266
|
repeal_el = repeal
|
213
267
|
return nil unless repeal_el
|
@@ -216,7 +270,9 @@ module Slaw
|
|
216
270
|
@meta.at_xpath("./a:references/a:passiveRef[@id='#{source_id}']", a: NS)
|
217
271
|
end
|
218
272
|
|
219
|
-
# The XML element representing the repeal of this act, or nil
|
273
|
+
# The XML element representing the event of repeal of this act, or nil
|
274
|
+
#
|
275
|
+
# @return [Nokogiri::XML::Node]
|
220
276
|
def repeal
|
221
277
|
# <lifecycle source="#this">
|
222
278
|
# <eventRef id="e1" date="2010-07-28" source="#original" type="generation"/>
|
@@ -226,11 +282,15 @@ module Slaw
|
|
226
282
|
@meta.at_xpath('./a:lifecycle/a:eventRef[@type="repeal"]', a: NS)
|
227
283
|
end
|
228
284
|
|
285
|
+
# The date at which this particular XML manifestation of this document was generated.
|
286
|
+
#
|
287
|
+
# @return [String] date, YYYY-MM-DD
|
229
288
|
def manifestation_date
|
230
289
|
node = @meta.at_xpath('./a:identification/a:FRBRManifestation/a:FRBRdate[@name="Generation"]', a: NS)
|
231
290
|
node && node['date']
|
232
291
|
end
|
233
292
|
|
293
|
+
# The underlying nature of this act, usually `act` although subclasses may override this.
|
234
294
|
def nature
|
235
295
|
"act"
|
236
296
|
end
|
data/lib/slaw/bylaw.rb
CHANGED
@@ -1,12 +1,19 @@
|
|
1
1
|
require 'slaw/act'
|
2
2
|
|
3
3
|
module Slaw
|
4
|
-
#
|
4
|
+
# An extension of {Slaw::Act} which wraps an AkomaNtoso XML document describing a By-Law.
|
5
|
+
#
|
6
|
+
# There are minor differences between Acts and By-laws, the most notable being that a by-law
|
7
|
+
# is not identified by a year and a number, and therefore has a different FRBR uri structure.
|
5
8
|
class ByLaw < Act
|
6
9
|
|
7
|
-
|
10
|
+
# [String] The region this by-law applies to
|
11
|
+
attr_accessor :region
|
12
|
+
|
13
|
+
# [String] A short file-like name of this by-law, unique within its year and region
|
14
|
+
attr_accessor :name
|
8
15
|
|
9
|
-
def
|
16
|
+
def _extract_id
|
10
17
|
# /za/by-law/cape-town/2010/public-parks
|
11
18
|
|
12
19
|
@id_uri = @meta.at_xpath('./a:identification/a:FRBRWork/a:FRBRuri', a: NS)['value']
|
@@ -22,30 +29,16 @@ module Slaw
|
|
22
29
|
end
|
23
30
|
|
24
31
|
def short_title
|
25
|
-
|
26
|
-
|
27
|
-
if node
|
28
|
-
@short_title = node['value']
|
29
|
-
else
|
30
|
-
@short_title = "(Unknown)"
|
31
|
-
end
|
32
|
-
|
33
|
-
if amended? and not @short_title.end_with?("as amended")
|
34
|
-
@short_title = @short_title + " as amended"
|
35
|
-
end
|
36
|
-
end
|
32
|
+
node = @meta.at_xpath('./a:identification/a:FRBRWork/a:FRBRalias', a: NS)
|
33
|
+
short_title = node ? node['value'] : "(Unknown)"
|
37
34
|
|
38
|
-
|
39
|
-
|
35
|
+
if amended? and not short_title.end_with?("as amended")
|
36
|
+
short_title = short_title + " as amended"
|
37
|
+
end
|
40
38
|
|
41
|
-
|
42
|
-
"/#{@country}/by-law/#{@region}/#{@year}/#{@name}/"
|
39
|
+
short_title
|
43
40
|
end
|
44
41
|
|
45
|
-
def url_file
|
46
|
-
@name
|
47
|
-
end
|
48
|
-
|
49
42
|
def nature
|
50
43
|
"by-law"
|
51
44
|
end
|
data/lib/slaw/collection.rb
CHANGED
@@ -2,11 +2,28 @@ require 'forwardable'
|
|
2
2
|
|
3
3
|
module Slaw
|
4
4
|
# A collection of Act instances.
|
5
|
+
#
|
6
|
+
# This is useful for looking up acts by their FRBR uri and for
|
7
|
+
# loading a collection of XML act documents.
|
8
|
+
#
|
9
|
+
# This collection is enumerable and can be iterated over. Use {#items} to
|
10
|
+
# access the underlying array of objects.
|
11
|
+
#
|
12
|
+
# @example Load a collection of acts and then iterate over them.
|
13
|
+
#
|
14
|
+
# acts = Slaw::DocumentCollection.new
|
15
|
+
# acts.discover('/path/to/acts/')
|
16
|
+
#
|
17
|
+
# for act in acts
|
18
|
+
# puts act.short_title
|
19
|
+
# end
|
20
|
+
#
|
5
21
|
class DocumentCollection
|
6
22
|
|
7
23
|
include Enumerable
|
8
24
|
extend Forwardable
|
9
25
|
|
26
|
+
# [Array<Act>] The underlying array of acts
|
10
27
|
attr_accessor :items
|
11
28
|
|
12
29
|
def_delegators :items, :each, :<<, :length
|
@@ -15,16 +32,27 @@ module Slaw
|
|
15
32
|
@items = items || []
|
16
33
|
end
|
17
34
|
|
18
|
-
# Find all XML files in
|
19
|
-
#
|
35
|
+
# Find all XML files in `path` and add them into this
|
36
|
+
# collection.
|
37
|
+
#
|
38
|
+
# @param path [String] the path to glob for xml files
|
39
|
+
# @param cls [Class] the class to instantiate for each file
|
40
|
+
#
|
41
|
+
# @return [DocumentCollection] this collection
|
20
42
|
def discover(path, cls=Slaw::Act)
|
21
43
|
for fname in Dir.glob("#{path}/**/*.xml")
|
22
44
|
@items << cls.new(fname)
|
23
45
|
end
|
46
|
+
|
47
|
+
self
|
24
48
|
end
|
25
49
|
|
26
50
|
# Try to find an act whose FRBRuri matches this one,
|
27
51
|
# returning nil on failure
|
52
|
+
#
|
53
|
+
# @param uri [String] the uri to look for
|
54
|
+
#
|
55
|
+
# @return [Act, nil] the act, or nil
|
28
56
|
def for_uri(uri)
|
29
57
|
return @items.find { |doc| doc.id_uri == uri }
|
30
58
|
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
require 'open3'
|
2
|
+
|
3
|
+
module Slaw
|
4
|
+
module Extract
|
5
|
+
|
6
|
+
# Routines for extracting and cleaning up content from other formats, such as PDF.
|
7
|
+
#
|
8
|
+
# You may need to set the location of the `pdftotext` binary.
|
9
|
+
#
|
10
|
+
# On Mac OS X, use `brew install xpdf` or download from http://www.foolabs.com/xpdf/download.html
|
11
|
+
#
|
12
|
+
# On Heroku, you'll need to do some hoop jumping, see http://theprogrammingbutler.com/blog/archives/2011/07/28/running-pdftotext-on-heroku/
|
13
|
+
class Extractor
|
14
|
+
include Slaw::Logging
|
15
|
+
|
16
|
+
@@pdftotext_path = "pdftotext"
|
17
|
+
|
18
|
+
# Object with text cleaning helpers
|
19
|
+
attr_accessor :cleanser
|
20
|
+
|
21
|
+
def initialize
|
22
|
+
@cleanser = Slaw::Parse::Cleanser.new
|
23
|
+
end
|
24
|
+
|
25
|
+
# Extract text from a file and run cleanup on it.
|
26
|
+
#
|
27
|
+
# @param filename [String] filename to extract from
|
28
|
+
#
|
29
|
+
# @return [String] extracted text
|
30
|
+
def extract_from_file(filename)
|
31
|
+
ext = filename[-4..-1].downcase
|
32
|
+
|
33
|
+
case ext
|
34
|
+
when '.pdf'
|
35
|
+
extract_from_pdf(filename)
|
36
|
+
when '.txt'
|
37
|
+
extract_from_text(filename)
|
38
|
+
else
|
39
|
+
raise ArgumentError.new("Unsupported file type #{ext}")
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
# Extract text from a PDF
|
44
|
+
#
|
45
|
+
# @param filename [String] filename to extract from
|
46
|
+
#
|
47
|
+
# @return [String] extracted text
|
48
|
+
def extract_from_pdf(filename)
|
49
|
+
cmd = pdf_to_text_cmd(filename)
|
50
|
+
logger.info("Executing: #{cmd}")
|
51
|
+
stdout, status = Open3.capture2(*cmd)
|
52
|
+
|
53
|
+
if status == 0
|
54
|
+
cleanup(stdout)
|
55
|
+
else
|
56
|
+
nil
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
# Build a command for the external PDF-to-text utility.
|
61
|
+
#
|
62
|
+
# @param filename [String] the pdf file
|
63
|
+
#
|
64
|
+
# @return [Array<String>] command and params to execute
|
65
|
+
def pdf_to_text_cmd(filename)
|
66
|
+
[Extractor.pdftotext_path, "-enc", "UTF-8", filename, "-"]
|
67
|
+
end
|
68
|
+
|
69
|
+
def extract_from_text(filename)
|
70
|
+
cleanup(File.read(filename))
|
71
|
+
end
|
72
|
+
|
73
|
+
# Run general once-off cleanup of extracted text.
|
74
|
+
def cleanup(text)
|
75
|
+
text = @cleanser.cleanup(text)
|
76
|
+
text = @cleanser.remove_empty_lines(text)
|
77
|
+
text = @cleanser.reformat(text)
|
78
|
+
|
79
|
+
text
|
80
|
+
end
|
81
|
+
|
82
|
+
# Get location of the pdftotext executable for all instances.
|
83
|
+
def self.pdftotext_path
|
84
|
+
@@pdftotext_path
|
85
|
+
end
|
86
|
+
|
87
|
+
# Set location of the pdftotext executable for all instances.
|
88
|
+
def self.pdftotext_path=(val)
|
89
|
+
@@pdftotext_path = val
|
90
|
+
end
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
@@ -3,28 +3,36 @@ module Slaw
|
|
3
3
|
module Blocklists
|
4
4
|
include Slaw::Namespace
|
5
5
|
|
6
|
-
# Correctly re-nest nested block lists.
|
6
|
+
# Correctly re-nest nested block lists. We do this by identifying the
|
7
|
+
# numbering format of each item in the list and comparing it with the
|
8
|
+
# surrounding elements. When the numbering format changes, we start
|
9
|
+
# a new nested list.
|
7
10
|
#
|
8
|
-
# (
|
9
|
-
# (
|
10
|
-
#
|
11
|
-
# (ii)
|
12
|
-
# (aa)
|
13
|
-
# (bb)
|
14
|
-
# (c)
|
15
|
-
# (d)
|
11
|
+
# We make sure to handle special cases such as `(i)` coming between
|
12
|
+
# `(h)` and `(j)` versus being at the start of a `(i), (ii), (iii)`
|
13
|
+
# list.
|
16
14
|
#
|
17
|
-
#
|
18
|
-
#
|
19
|
-
#
|
20
|
-
#
|
21
|
-
# (i)
|
22
|
-
# (ii)
|
15
|
+
# (a)
|
16
|
+
# (b)
|
17
|
+
# (i)
|
18
|
+
# (ii)
|
23
19
|
# (aa)
|
24
20
|
# (bb)
|
25
|
-
#
|
26
|
-
#
|
21
|
+
# (c)
|
22
|
+
# (d)
|
23
|
+
#
|
24
|
+
# becomes
|
25
|
+
#
|
26
|
+
# (a)
|
27
|
+
# (b)
|
28
|
+
# (i)
|
29
|
+
# (ii)
|
30
|
+
# (aa)
|
31
|
+
# (bb)
|
32
|
+
# (c)
|
33
|
+
# (d)
|
27
34
|
#
|
35
|
+
# @param doc [Nokogiri::XML::Document] the document
|
28
36
|
def self.nest_blocklists(doc)
|
29
37
|
doc.xpath('//a:blockList', a: NS).each do |blocklist|
|
30
38
|
items = blocklist.xpath('a:item', a: NS)
|
data/lib/slaw/parse/builder.rb
CHANGED
@@ -1,25 +1,67 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
1
3
|
require 'builder'
|
2
4
|
require 'treetop'
|
3
5
|
|
4
6
|
module Slaw
|
5
7
|
module Parse
|
6
|
-
#
|
8
|
+
# The primary class for building Akoma Ntoso documents from plain text documents.
|
9
|
+
#
|
10
|
+
# The builder uses a grammar to break down a plain-text version of an act into a
|
11
|
+
# syntax tree. This tree can then be serialized into an Akoma Ntoso compatible
|
12
|
+
# XML document.
|
13
|
+
#
|
14
|
+
# @example Parse some text into a well-formed document
|
15
|
+
# builder = Slaw::Parse::Builder.new
|
16
|
+
# xml = builder.parse_text(text)
|
17
|
+
# doc = builder.parse_xml(xml)
|
18
|
+
# builder.postprocess(doc)
|
19
|
+
#
|
20
|
+
# @example A quicker way to build a well-formed document
|
21
|
+
# builder = Slaw::Parse::Builder.new
|
22
|
+
# doc = builder.parse_and_process_text(text)
|
7
23
|
#
|
8
|
-
# It can convert from plain text a new Akoma Ntoso document, or
|
9
|
-
# update existing documents.
|
10
24
|
class Builder
|
11
25
|
include Slaw::Namespace
|
12
26
|
include Slaw::Logging
|
13
27
|
|
14
28
|
Treetop.load(File.dirname(__FILE__) + "/bylaw.treetop")
|
15
29
|
|
30
|
+
# [Hash] A Hash of options that are made available to the parser when parsing.
|
16
31
|
attr_accessor :parse_options
|
17
32
|
|
18
|
-
def initialize()
|
19
|
-
@parse_options =
|
33
|
+
def initialize(parse_options={})
|
34
|
+
@parse_options = parse_options
|
35
|
+
end
|
36
|
+
|
37
|
+
# Do all the work necessary to parse text into a well-formed XML document.
|
38
|
+
#
|
39
|
+
# @param text [String] the text to parse
|
40
|
+
# @param root [Symbol] the root element of the grammar
|
41
|
+
#
|
42
|
+
# @return [Nokogiri::XML::Document] a well formed document
|
43
|
+
def parse_and_process_text(text, root=:bylaw)
|
44
|
+
postprocess(parse_xml(parse_text(text, root)))
|
45
|
+
end
|
46
|
+
|
47
|
+
# Parse text into XML. You should still run {#postprocess} on the
|
48
|
+
# resulting XML to normalise it.
|
49
|
+
#
|
50
|
+
# @param text [String] the text to parse
|
51
|
+
# @param root [Symbol] the root element of the grammar
|
52
|
+
#
|
53
|
+
# @return [String] an XML string
|
54
|
+
def parse_text(text, root=:bylaw)
|
55
|
+
tree = text_to_syntax_tree(text, root)
|
56
|
+
xml_from_syntax_tree(tree)
|
20
57
|
end
|
21
58
|
|
22
|
-
#
|
59
|
+
# Parse plain text into a syntax tree.
|
60
|
+
#
|
61
|
+
# @param text [String] the text to parse
|
62
|
+
# @param root [Symbol] the root element of the grammar
|
63
|
+
#
|
64
|
+
# @return [Object] the root of the resulting parse tree, usually a Treetop::Node object
|
23
65
|
def text_to_syntax_tree(text, root=:bylaw)
|
24
66
|
parser = Slaw::Parse::BylawParser.new
|
25
67
|
parser.options = @parse_options
|
@@ -35,7 +77,12 @@ module Slaw
|
|
35
77
|
tree
|
36
78
|
end
|
37
79
|
|
38
|
-
# Generate an XML document from the given syntax tree.
|
80
|
+
# Generate an XML document from the given syntax tree. You should still
|
81
|
+
# run {#postprocess} on the resulting XML to normalise it.
|
82
|
+
#
|
83
|
+
# @param tree [Object] a Treetop::Node object
|
84
|
+
#
|
85
|
+
# @return [String] an XML string
|
39
86
|
def xml_from_syntax_tree(tree)
|
40
87
|
s = ""
|
41
88
|
builder = ::Builder::XmlMarkup.new(indent: 2, target: s)
|
@@ -50,38 +97,41 @@ module Slaw
|
|
50
97
|
s
|
51
98
|
end
|
52
99
|
|
100
|
+
# Parse a string into a Nokogiri::XML::Document
|
101
|
+
#
|
102
|
+
# @param xml [String] string to parse
|
103
|
+
#
|
104
|
+
# @return [Nokogiri::XML::Document]
|
53
105
|
def parse_xml(xml)
|
54
106
|
Nokogiri::XML(xml, &:noblanks)
|
55
107
|
end
|
56
108
|
|
109
|
+
# Serialise a Nokogiri::XML::Document into a string
|
110
|
+
#
|
111
|
+
# @param doc [Nokogiri::XML::Document] document
|
112
|
+
#
|
113
|
+
# @return [String] pretty printed string
|
57
114
|
def to_xml(doc)
|
58
115
|
doc.to_xml(indent: 2)
|
59
116
|
end
|
60
117
|
|
61
|
-
#
|
62
|
-
#
|
118
|
+
# Postprocess an XML document.
|
119
|
+
#
|
120
|
+
# @param doc [Nokogiri::XML::Document]
|
121
|
+
#
|
122
|
+
# @return [Nokogiri::XML::Document] the updated document
|
63
123
|
def postprocess(doc)
|
64
124
|
normalise_headings(doc)
|
65
125
|
find_short_title(doc)
|
66
|
-
sanitise(doc)
|
67
|
-
end
|
68
|
-
|
69
|
-
# Do sanitisations, such as finding and linking definitions
|
70
|
-
def sanitise(doc)
|
71
126
|
link_definitions(doc)
|
72
127
|
nest_blocklists(doc)
|
73
|
-
end
|
74
128
|
|
75
|
-
|
76
|
-
def renumber_terms(doc)
|
77
|
-
logger.info("Renumbering terms")
|
78
|
-
|
79
|
-
doc.xpath('//a:term', a: NS).each_with_index do |term, i|
|
80
|
-
term['id'] = "trm#{i}"
|
81
|
-
end
|
129
|
+
doc
|
82
130
|
end
|
83
131
|
|
84
132
|
# Change CAPCASE headings into Sentence case.
|
133
|
+
#
|
134
|
+
# @param doc [Nokogiri::XML::Document]
|
85
135
|
def normalise_headings(doc)
|
86
136
|
logger.info("Normalising headings")
|
87
137
|
|
@@ -94,6 +144,8 @@ module Slaw
|
|
94
144
|
end
|
95
145
|
|
96
146
|
# Find the short title and add it as an FRBRalias element in the meta section
|
147
|
+
#
|
148
|
+
# @param doc [Nokogiri::XML::Document]
|
97
149
|
def find_short_title(doc)
|
98
150
|
logger.info("Finding short title")
|
99
151
|
|
@@ -117,6 +169,8 @@ module Slaw
|
|
117
169
|
|
118
170
|
# Find definitions of terms and introduce them into the
|
119
171
|
# meta section of the document.
|
172
|
+
#
|
173
|
+
# @param doc [Nokogiri::XML::Document]
|
120
174
|
def link_definitions(doc)
|
121
175
|
logger.info("Finding and linking definitions")
|
122
176
|
|
@@ -126,6 +180,12 @@ module Slaw
|
|
126
180
|
renumber_terms(doc)
|
127
181
|
end
|
128
182
|
|
183
|
+
# Find `def` elements in the document and return a Hash from
|
184
|
+
# term ids to the text of each term
|
185
|
+
#
|
186
|
+
# @param doc [Nokogiri::XML::Document]
|
187
|
+
#
|
188
|
+
# @return [Hash{String => String}]
|
129
189
|
def find_definitions(doc)
|
130
190
|
guess_at_definitions(doc)
|
131
191
|
|
@@ -239,6 +299,21 @@ module Slaw
|
|
239
299
|
end
|
240
300
|
end
|
241
301
|
|
302
|
+
# recalculate ids for <term> elements
|
303
|
+
def renumber_terms(doc)
|
304
|
+
logger.info("Renumbering terms")
|
305
|
+
|
306
|
+
doc.xpath('//a:term', a: NS).each_with_index do |term, i|
|
307
|
+
term['id'] = "trm#{i}"
|
308
|
+
end
|
309
|
+
end
|
310
|
+
|
311
|
+
# Correctly nest blocklists.
|
312
|
+
#
|
313
|
+
# The grammar gives us flat blocklists, we need to introspect the
|
314
|
+
# numbering of the lists to correctly nest them.
|
315
|
+
#
|
316
|
+
# @param doc [Nokogiri::XML::Document]
|
242
317
|
def nest_blocklists(doc)
|
243
318
|
logger.info("Nesting blocklists")
|
244
319
|
|
data/lib/slaw/parse/cleanser.rb
CHANGED
@@ -50,16 +50,17 @@ module Slaw
|
|
50
50
|
.gsub("", '')
|
51
51
|
end
|
52
52
|
|
53
|
+
# change weird quotes to normal ones
|
53
54
|
def fix_quotes(s)
|
54
|
-
# change weird quotes to normal ones
|
55
55
|
s.gsub(/‘‘|’’|''/, '"')
|
56
56
|
end
|
57
57
|
|
58
|
+
# tabs to spaces
|
58
59
|
def expand_tabs(s)
|
59
|
-
# tabs to spaces
|
60
60
|
s.gsub(/\t/, ' ')
|
61
61
|
end
|
62
62
|
|
63
|
+
# Try to remove boilerplate lines found in many files, such as page numbers.
|
63
64
|
def remove_boilerplate(s)
|
64
65
|
# nuke any line to do with Sabinet and the government printer
|
65
66
|
s.gsub(/^.*Sabinet.*Government Printer.*$/i, '')\
|
@@ -72,6 +73,8 @@ module Slaw
|
|
72
73
|
.gsub(/^\s*page \d+( of \d+)?\s*\n/i, '')
|
73
74
|
end
|
74
75
|
|
76
|
+
# Get rid of whitespace at the end of lines and at the start and end of the
|
77
|
+
# entire string.
|
75
78
|
def chomp(s)
|
76
79
|
# trailing whitespace at end of lines
|
77
80
|
s = s.gsub(/ +$/, '')
|
@@ -85,8 +88,11 @@ module Slaw
|
|
85
88
|
s.end_with?("\n") ? s : (s + "\n")
|
86
89
|
end
|
87
90
|
|
88
|
-
#
|
89
|
-
# have been broken but haven't, and break them
|
91
|
+
# Make educated guesses about lines that should
|
92
|
+
# have been broken but haven't, and break them.
|
93
|
+
#
|
94
|
+
# This is very dependent on a locale's legislation grammar, there are
|
95
|
+
# lots of rules of thumb that make this work.
|
90
96
|
def break_lines(s)
|
91
97
|
# often we find a section title munged onto the same line as its first statement
|
92
98
|
# eg:
|
@@ -115,8 +121,8 @@ module Slaw
|
|
115
121
|
s
|
116
122
|
end
|
117
123
|
|
118
|
-
#
|
119
|
-
# and
|
124
|
+
# Find likely candidates for unnecessarily broken lines
|
125
|
+
# and unbreak them.
|
120
126
|
def unbreak_lines(s)
|
121
127
|
lines = s.split(/\n/)
|
122
128
|
output = []
|
@@ -141,8 +147,8 @@ module Slaw
|
|
141
147
|
output.join("\n")
|
142
148
|
end
|
143
149
|
|
144
|
-
#
|
145
|
-
# it really confuses the grammer
|
150
|
+
# Do our best to remove table of contents at the start,
|
151
|
+
# it really confuses the grammar.
|
146
152
|
def strip_toc(s)
|
147
153
|
# first, try to find 'TABLE OF CONTENTS' anywhere within the first 4K of text,
|
148
154
|
if toc_start = s[0..4096].match(/TABLE OF CONTENTS/i)
|
data/lib/slaw/render/html.rb
CHANGED
@@ -2,7 +2,14 @@ module Slaw
|
|
2
2
|
module Render
|
3
3
|
|
4
4
|
# Support for transforming XML AN documents into HTML.
|
5
|
+
#
|
6
|
+
# This rendering is done using XSLT stylesheets. Both an entire
|
7
|
+
# document and fragments can be rendered.
|
5
8
|
class HTMLRenderer
|
9
|
+
|
10
|
+
# [Hash] A Hash of Nokogiri::XSLT objects
|
11
|
+
attr_accessor :xslt
|
12
|
+
|
6
13
|
def initialize
|
7
14
|
here = File.dirname(__FILE__)
|
8
15
|
|
@@ -12,12 +19,17 @@ module Slaw
|
|
12
19
|
}
|
13
20
|
end
|
14
21
|
|
15
|
-
# Transform an entire XML document
|
16
|
-
# Specify
|
22
|
+
# Transform an entire XML document (a Nokogiri::XML::Document object) into HTML.
|
23
|
+
# Specify `base_url` to manage the base for relative URLs generated by
|
17
24
|
# the transform.
|
25
|
+
#
|
26
|
+
# @param doc [Nokogiri::XML::Document] document to render
|
27
|
+
# @param base_url [String] root URL for relative URLs (cannot be empty)
|
28
|
+
#
|
29
|
+
# @return [String]
|
18
30
|
def render(doc, base_url='')
|
19
|
-
params =
|
20
|
-
|
31
|
+
params = _transform_params({'base_url' => base_url})
|
32
|
+
_run_xslt(:act, doc, params)
|
21
33
|
end
|
22
34
|
|
23
35
|
# Transform just a single node and its children into HTML.
|
@@ -25,8 +37,13 @@ module Slaw
|
|
25
37
|
# If +elem+ has an id, we use xpath to tell the XSLT which
|
26
38
|
# element to transform. Otherwise we copy the node into a new
|
27
39
|
# tree and apply the XSLT to that.
|
40
|
+
#
|
41
|
+
# @param node [Nokogiri::XML::Node] node to render
|
42
|
+
# @param base_url [String] root URL for relative URLs (cannot be empty)
|
43
|
+
#
|
44
|
+
# @return [String]
|
28
45
|
def render_node(node, base_url='')
|
29
|
-
params =
|
46
|
+
params = _transform_params({'base_url' => base_url})
|
30
47
|
|
31
48
|
if node.id
|
32
49
|
params += ['root_elem', "//*[@id='#{node.id}']"]
|
@@ -38,14 +55,14 @@ module Slaw
|
|
38
55
|
params += ['root_elem', '*']
|
39
56
|
end
|
40
57
|
|
41
|
-
|
58
|
+
_run_xslt(:fragment, doc, params)
|
42
59
|
end
|
43
60
|
|
44
|
-
def
|
61
|
+
def _run_xslt(xslt, doc, params)
|
45
62
|
@xslt[xslt].transform(doc, params).to_s
|
46
63
|
end
|
47
64
|
|
48
|
-
def
|
65
|
+
def _transform_params(params)
|
49
66
|
Nokogiri::XSLT.quote_params(params)
|
50
67
|
end
|
51
68
|
end
|
@@ -77,7 +77,7 @@
|
|
77
77
|
<xsl:value-of select="@refersTo" />
|
78
78
|
</xsl:attribute>
|
79
79
|
|
80
|
-
<xsl:attribute name="href"><xsl:value-of select="$base_url"
|
80
|
+
<xsl:attribute name="href"><xsl:value-of select="$base_url" />/definitions/#def-<xsl:value-of select="translate(@refersTo, '#', '')" /></xsl:attribute>
|
81
81
|
|
82
82
|
<xsl:apply-templates />
|
83
83
|
</a>
|
data/lib/slaw/version.rb
CHANGED
data/slaw.gemspec
CHANGED
@@ -23,7 +23,6 @@ Gem::Specification.new do |spec|
|
|
23
23
|
spec.add_development_dependency "rspec", "~> 2.14.1"
|
24
24
|
|
25
25
|
spec.add_runtime_dependency "nokogiri", "~> 1.6.0"
|
26
|
-
spec.add_runtime_dependency "elasticsearch", "~> 1.0.5"
|
27
26
|
spec.add_runtime_dependency "treetop", "~> 1.5"
|
28
27
|
spec.add_runtime_dependency "builder", "~> 3.2.2"
|
29
28
|
spec.add_runtime_dependency "log4r", "~> 1.1.10"
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'tempfile'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
require 'slaw'
|
5
|
+
|
6
|
+
describe Slaw::Extract::Extractor do
|
7
|
+
it 'should extract from plain text' do
|
8
|
+
f = Tempfile.new(['test', '.txt'])
|
9
|
+
f.write('This is some text')
|
10
|
+
f.rewind
|
11
|
+
|
12
|
+
subject.extract_from_file(f.path).should == "This is some text\n"
|
13
|
+
end
|
14
|
+
end
|
data/spec/parse/builder_spec.rb
CHANGED
data/spec/parse/bylaw_spec.rb
CHANGED
data/spec/parse/cleanser_spec.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: slaw
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Greg Kempe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-09-
|
11
|
+
date: 2014-09-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -66,20 +66,6 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: 1.6.0
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: elasticsearch
|
71
|
-
requirement: !ruby/object:Gem::Requirement
|
72
|
-
requirements:
|
73
|
-
- - "~>"
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: 1.0.5
|
76
|
-
type: :runtime
|
77
|
-
prerelease: false
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
79
|
-
requirements:
|
80
|
-
- - "~>"
|
81
|
-
- !ruby/object:Gem::Version
|
82
|
-
version: 1.0.5
|
83
69
|
- !ruby/object:Gem::Dependency
|
84
70
|
name: treetop
|
85
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -131,6 +117,7 @@ extensions: []
|
|
131
117
|
extra_rdoc_files: []
|
132
118
|
files:
|
133
119
|
- ".gitignore"
|
120
|
+
- ".travis.yml"
|
134
121
|
- Gemfile
|
135
122
|
- LICENSE.txt
|
136
123
|
- README.md
|
@@ -139,7 +126,7 @@ files:
|
|
139
126
|
- lib/slaw/act.rb
|
140
127
|
- lib/slaw/bylaw.rb
|
141
128
|
- lib/slaw/collection.rb
|
142
|
-
- lib/slaw/
|
129
|
+
- lib/slaw/extract/extractor.rb
|
143
130
|
- lib/slaw/lifecycle_event.rb
|
144
131
|
- lib/slaw/logging.rb
|
145
132
|
- lib/slaw/namespace.rb
|
@@ -157,6 +144,7 @@ files:
|
|
157
144
|
- lib/slaw/version.rb
|
158
145
|
- lib/slaw/xml_support.rb
|
159
146
|
- slaw.gemspec
|
147
|
+
- spec/extract/extractor_spec.rb
|
160
148
|
- spec/parse/builder_spec.rb
|
161
149
|
- spec/parse/bylaw_spec.rb
|
162
150
|
- spec/parse/cleanser_spec.rb
|
@@ -187,6 +175,7 @@ signing_key:
|
|
187
175
|
specification_version: 4
|
188
176
|
summary: A lightweight library for using Akoma Ntoso acts in Ruby.
|
189
177
|
test_files:
|
178
|
+
- spec/extract/extractor_spec.rb
|
190
179
|
- spec/parse/builder_spec.rb
|
191
180
|
- spec/parse/bylaw_spec.rb
|
192
181
|
- spec/parse/cleanser_spec.rb
|
data/lib/slaw/elasticsearch.rb
DELETED
@@ -1,107 +0,0 @@
|
|
1
|
-
require 'elasticsearch'
|
2
|
-
require 'log4r'
|
3
|
-
|
4
|
-
module Slaw
|
5
|
-
# Support for indexing and search using elasticsearch
|
6
|
-
class ElasticSearchSupport
|
7
|
-
attr_accessor :es, :mapping, :index, :type, :base_url
|
8
|
-
|
9
|
-
def initialize(index, type, base_url, client_params={}, es=nil)
|
10
|
-
@es = es || create_client(client_params)
|
11
|
-
|
12
|
-
@ix = index
|
13
|
-
@type = type
|
14
|
-
@base_url = base_url
|
15
|
-
|
16
|
-
@mapping = {
|
17
|
-
frbr_uri: {type: 'string', index: 'not_analyzed'},
|
18
|
-
url: {type: 'string', index: 'not_analyzed'},
|
19
|
-
title: {type: 'string', analyzer: 'english'},
|
20
|
-
content: {type: 'string', analyzer: 'english'},
|
21
|
-
published_on: {type: 'date', format: 'dateOptionalTime'},
|
22
|
-
region: {type: 'string', index: 'not_analyzed'},
|
23
|
-
region_name: {type: 'string', index: 'not_analyzed'},
|
24
|
-
repealed: {type: 'boolean'},
|
25
|
-
}
|
26
|
-
|
27
|
-
@log = Log4r::Logger['Slaw']
|
28
|
-
end
|
29
|
-
|
30
|
-
def create_client(client_params)
|
31
|
-
Elasticsearch::Client.new(client_params)
|
32
|
-
end
|
33
|
-
|
34
|
-
def reindex!(docs, &block)
|
35
|
-
define_mapping!
|
36
|
-
index_documents!(docs, &block)
|
37
|
-
end
|
38
|
-
|
39
|
-
def index_documents!(docs, &block)
|
40
|
-
for doc in docs
|
41
|
-
id = doc.id_uri.gsub('/', '-')
|
42
|
-
|
43
|
-
data = {
|
44
|
-
frbr_uri: doc.id_uri,
|
45
|
-
url: @base_url + doc.id_uri,
|
46
|
-
title: doc.short_title,
|
47
|
-
content: doc.body.text,
|
48
|
-
region: doc.region,
|
49
|
-
published_on: doc.publication['date'],
|
50
|
-
repealed: doc.repealed?,
|
51
|
-
}
|
52
|
-
|
53
|
-
yield doc, data if block_given?
|
54
|
-
|
55
|
-
@log.info("Indexing #{id}")
|
56
|
-
@es.index(index: @ix, type: @type, id: id, body: data)
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
def define_mapping!
|
61
|
-
@log.info("Deleting index")
|
62
|
-
@es.indices.create(index: @ix) unless @es.indices.exists(index: @ix)
|
63
|
-
|
64
|
-
# delete existing mapping
|
65
|
-
unless @es.indices.get_mapping(index: @ix, type: @type).empty?
|
66
|
-
@es.indices.delete_mapping(index: @ix, type: @type)
|
67
|
-
end
|
68
|
-
|
69
|
-
@log.info("Defining mappings")
|
70
|
-
@es.indices.put_mapping(index: @ix, type: @type, body: {
|
71
|
-
@type => {properties: @mapping}
|
72
|
-
})
|
73
|
-
end
|
74
|
-
|
75
|
-
def search(q, from=0, size=10)
|
76
|
-
@es.search(index: @ix, body: {
|
77
|
-
query: {
|
78
|
-
multi_match: {
|
79
|
-
query: q,
|
80
|
-
type: 'cross_fields',
|
81
|
-
fields: ['title', 'content'],
|
82
|
-
}
|
83
|
-
},
|
84
|
-
fields: ['frbr_uri', 'repealed', 'published_on', 'title', 'url', 'region_name'],
|
85
|
-
highlight: {
|
86
|
-
order: "score",
|
87
|
-
fields: {
|
88
|
-
content: {
|
89
|
-
fragment_size: 80,
|
90
|
-
number_of_fragments: 2,
|
91
|
-
},
|
92
|
-
title: {
|
93
|
-
number_of_fragments: 0, # entire field
|
94
|
-
}
|
95
|
-
},
|
96
|
-
pre_tags: ['<mark>'],
|
97
|
-
post_tags: ['</mark>'],
|
98
|
-
},
|
99
|
-
from: from,
|
100
|
-
size: size,
|
101
|
-
sort: {
|
102
|
-
'_score' => {order: 'desc'}
|
103
|
-
}
|
104
|
-
})
|
105
|
-
end
|
106
|
-
end
|
107
|
-
end
|