slaw 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +5 -0
- data/Gemfile +0 -1
- data/README.md +25 -3
- data/Rakefile +2 -0
- data/lib/slaw.rb +2 -1
- data/lib/slaw/act.rb +101 -41
- data/lib/slaw/bylaw.rb +16 -23
- data/lib/slaw/collection.rb +30 -2
- data/lib/slaw/extract/extractor.rb +93 -0
- data/lib/slaw/parse/blocklists.rb +25 -17
- data/lib/slaw/parse/builder.rb +97 -22
- data/lib/slaw/parse/bylaw.treetop +2 -0
- data/lib/slaw/parse/cleanser.rb +14 -8
- data/lib/slaw/parse/grammar_helpers.rb +4 -0
- data/lib/slaw/render/html.rb +25 -8
- data/lib/slaw/render/xsl/elements.xsl +1 -1
- data/lib/slaw/version.rb +1 -1
- data/slaw.gemspec +0 -1
- data/spec/extract/extractor_spec.rb +14 -0
- data/spec/parse/builder_spec.rb +2 -0
- data/spec/parse/bylaw_spec.rb +2 -0
- data/spec/parse/cleanser_spec.rb +2 -0
- metadata +6 -17
- data/lib/slaw/elasticsearch.rb +0 -107
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 680301c5ade280eb7da5ea92c509f491631824f2
|
4
|
+
data.tar.gz: f2ddd5a99631121bf3693da5f229e38a6f590142
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 844130f24fa5e4e7e2acd8bacc9381bbd043591676a4fd22e9f1deec87e99b813f3062e4c4ec7286aca4ec0fe2a17161c39d85f5a07c8819192c82cd6203e474
|
7
|
+
data.tar.gz: de11ab3cb747c7341209e79f131506f6e2fc44065a73d95bb936c2b36b348646644024b0b657252106cd4c6d9f1b792ca4f7884e8f45fff4bda453da0a736cb7
|
data/.travis.yml
ADDED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Slaw
|
1
|
+
# Slaw [](http://travis-ci.org/longhotsummer/slaw)
|
2
2
|
|
3
3
|
Slaw is a lightweight library for rendering and generating Akoma Ntoso acts from plain text and PDF documents.
|
4
4
|
It is used to power [openbylaws.org.za](http://openbylaws.org.za).
|
@@ -21,11 +21,33 @@ Or install it yourself as:
|
|
21
21
|
|
22
22
|
TODO: Write usage instructions here
|
23
23
|
|
24
|
+
### Extracting text from PDFs
|
25
|
+
|
26
|
+
You will need [xpdf](http://www.foolabs.com/xpdf/) to run PDF extraction. If you're
|
27
|
+
on a Mac you can use
|
28
|
+
|
29
|
+
brew install xpdf
|
30
|
+
|
31
|
+
Extracting text from PDFs often breaks lines in odd places (or doesn't break them when it should). Slaw gets around
|
32
|
+
this by running some cleanup routines on the extracted text.
|
33
|
+
|
34
|
+
```ruby
|
35
|
+
extractor = Slaw::Extract::Extractor.new
|
36
|
+
|
37
|
+
# to guess the filetype by extension
|
38
|
+
text = extractor.extract_from_file('/path/to/file.pdf')
|
39
|
+
|
40
|
+
# or if you know it's a PDF
|
41
|
+
text = extractor.extract_from_pdf('/path/to/file.pdf')
|
42
|
+
|
43
|
+
# You can also "extract" text from a plain-text file
|
44
|
+
text = extractor.extract_from_text('/path/to/file.txt')
|
45
|
+
```
|
46
|
+
|
24
47
|
## Contributing
|
25
48
|
|
26
|
-
1. Fork it
|
49
|
+
1. Fork it at http://github.com/longhotsummer/slaw/fork
|
27
50
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
28
51
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
29
52
|
4. Push to the branch (`git push origin my-new-feature`)
|
30
53
|
5. Create new Pull Request
|
31
|
-
|
data/Rakefile
CHANGED
data/lib/slaw.rb
CHANGED
data/lib/slaw/act.rb
CHANGED
@@ -1,5 +1,12 @@
|
|
1
1
|
module Slaw
|
2
|
-
#
|
2
|
+
# An Act wraps a single {http://www.akomantoso.org/ AkomaNtoso 2.0 XML} act document in the form of a
|
3
|
+
# Nokogiri::XML::Document object.
|
4
|
+
#
|
5
|
+
# The Act object provides quick access to certain sections of the document,
|
6
|
+
# such as the metadata and the body, as well as common operations such as
|
7
|
+
# identifying whether it has been amended ({#amended?}), repealed
|
8
|
+
# ({#repealed?}) or what chapters ({#chapters}), parts ({#parts}) and
|
9
|
+
# sections ({#sections}) it contains.
|
3
10
|
class Act
|
4
11
|
include Slaw::Namespace
|
5
12
|
|
@@ -7,19 +14,45 @@ module Slaw
|
|
7
14
|
# Act instance itself
|
8
15
|
@@acts = {}
|
9
16
|
|
10
|
-
|
11
|
-
attr_accessor :
|
17
|
+
# [Nokogiri::XML::Document] The underlying {Nokogiri::XML::Document} instance
|
18
|
+
attr_accessor :doc
|
19
|
+
|
20
|
+
# [Nokogiri::XML::Node] The `meta` XML node
|
21
|
+
attr_accessor :meta
|
22
|
+
|
23
|
+
# [Nokogiri::XML::Node] The `body` XML node
|
24
|
+
attr_accessor :body
|
25
|
+
|
26
|
+
# [String] The year this act was published
|
27
|
+
attr_accessor :year
|
28
|
+
|
29
|
+
# [String] The act number in the year this act was published
|
30
|
+
attr_accessor :num
|
31
|
+
|
32
|
+
# [String] The FRBR URI of this act, which uniquely identifies it globally
|
33
|
+
attr_accessor :id_uri
|
34
|
+
|
35
|
+
# [String, nil] The source filename, or nil
|
36
|
+
attr_accessor :filename
|
37
|
+
|
38
|
+
# [Time, nil] The mtime of when the source file was last modified
|
39
|
+
attr_accessor :mtime
|
12
40
|
|
41
|
+
# Get the act that wraps the document that owns this XML node
|
42
|
+
# @param node [Nokogiri::XML::Node]
|
43
|
+
# @return [Act] owning act
|
13
44
|
def self.for_node(node)
|
14
45
|
@@acts[node.document]
|
15
46
|
end
|
16
47
|
|
17
|
-
# Create a new instance
|
48
|
+
# Create a new instance, loading from `filename` if given.
|
49
|
+
# @param filename [String] filename to load XML from
|
18
50
|
def initialize(filename=nil)
|
19
51
|
self.load(filename) if filename
|
20
52
|
end
|
21
53
|
|
22
|
-
# Load the XML
|
54
|
+
# Load the XML in `filename` into this instance
|
55
|
+
# @param filename [String] filename
|
23
56
|
def load(filename)
|
24
57
|
@filename = filename
|
25
58
|
@mtime = File::mtime(@filename)
|
@@ -27,7 +60,8 @@ module Slaw
|
|
27
60
|
File.open(filename) { |f| parse(f) }
|
28
61
|
end
|
29
62
|
|
30
|
-
# Parse the XML contained in the file-like object
|
63
|
+
# Parse the XML contained in the file-like object `io`
|
64
|
+
# @param io [file-like] io object with XML
|
31
65
|
def parse(io)
|
32
66
|
@doc = Nokogiri::XML(io)
|
33
67
|
@meta = @doc.at_xpath('/a:akomaNtoso/a:act/a:meta', a: NS)
|
@@ -35,10 +69,11 @@ module Slaw
|
|
35
69
|
|
36
70
|
@@acts[@doc] = self
|
37
71
|
|
38
|
-
|
72
|
+
_extract_id
|
39
73
|
end
|
40
74
|
|
41
|
-
|
75
|
+
# Parse the FRBR Uri into its constituent parts
|
76
|
+
def _extract_id
|
42
77
|
@id_uri = @meta.at_xpath('./a:identification/a:FRBRWork/a:FRBRuri', a: NS)['value']
|
43
78
|
empty, @country, type, date, @num = @id_uri.split('/')
|
44
79
|
|
@@ -46,48 +81,39 @@ module Slaw
|
|
46
81
|
@year = date.split('-', 2)[0]
|
47
82
|
end
|
48
83
|
|
84
|
+
# An applicable short title for this act, either from the `FRBRalias` element
|
85
|
+
# or based on the act number and year.
|
86
|
+
# @return [String]
|
49
87
|
def short_title
|
50
|
-
|
51
|
-
|
52
|
-
if node
|
53
|
-
@short_title = node['value']
|
54
|
-
else
|
55
|
-
@short_title = "Act #{num} of #{year}"
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
59
|
-
@short_title
|
60
|
-
end
|
61
|
-
|
62
|
-
def url_path
|
63
|
-
"/#{@country}/acts/#{@year}/#{@num}/"
|
64
|
-
end
|
65
|
-
|
66
|
-
def url_file
|
67
|
-
"act-#{@year}-#{@num}"
|
88
|
+
node = @meta.at_xpath('./a:identification/a:FRBRWork/a:FRBRalias', a: NS)
|
89
|
+
node ? node['value'] : "Act #{num} of #{year}"
|
68
90
|
end
|
69
91
|
|
70
|
-
# Has this act been amended?
|
92
|
+
# Has this act been amended? This is determined by testing the `contains`
|
93
|
+
# attribute of the `act` root element.
|
94
|
+
#
|
95
|
+
# @return [Boolean]
|
71
96
|
def amended?
|
72
97
|
@doc.at_xpath('/a:akomaNtoso/a:act', a: NS)['contains'] != 'originalVersion'
|
73
98
|
end
|
74
99
|
|
75
|
-
# a list of LifecycleEvent objects for amendment events, in date order
|
100
|
+
# Get a list of {Slaw::LifecycleEvent} objects for amendment events, in date order.
|
101
|
+
# @return [Array<Slaw::LifecycleEvent>] possibly empty list of lifecycle events
|
76
102
|
def amendment_events
|
77
103
|
@meta.xpath('./a:lifecycle/a:eventRef[@type="amendment"]', a: NS).map do |event|
|
78
104
|
LifecycleEvent.new(event)
|
79
105
|
end.sort_by { |e| e.date }
|
80
106
|
end
|
81
107
|
|
82
|
-
# Mark this act as being amended by another act, either
|
83
|
-
# or the details in
|
84
|
-
#
|
85
|
-
# :uri: uri of the amending act
|
86
|
-
# :title: title of the amending act
|
87
|
-
# :date: date of the amendment
|
108
|
+
# Mark this act as being amended by another act, either `act`
|
109
|
+
# or the details in `opts`.
|
88
110
|
#
|
89
111
|
# It is assumed that there can be only one amendment event on a particular
|
90
112
|
# date. An existing amendment on this date is overwritten.
|
113
|
+
#
|
114
|
+
# @option opts [String] :uri uri of the amending act
|
115
|
+
# @option opts [String] :title title of the amending act
|
116
|
+
# @option opts [String] :date date of the amendment (YYYY-MM-DD)
|
91
117
|
def amended_by!(act, opts={})
|
92
118
|
if act
|
93
119
|
opts[:uri] ||= act.id_uri
|
@@ -133,27 +159,40 @@ module Slaw
|
|
133
159
|
end
|
134
160
|
|
135
161
|
# Does this Act have parts?
|
162
|
+
# @return [Boolean]
|
136
163
|
def parts?
|
137
164
|
!parts.empty?
|
138
165
|
end
|
139
166
|
|
167
|
+
# Top-level parts of this act. Parts inside chapters are ignored.
|
168
|
+
# @return [Array<Nokogiri::XML::Node>] part nodes
|
140
169
|
def parts
|
141
170
|
@body.xpath('./a:part', a: NS)
|
142
171
|
end
|
143
172
|
|
173
|
+
# Does this Act have chapters?
|
174
|
+
# @return [Boolean]
|
144
175
|
def chapters?
|
145
176
|
!chapters.empty?
|
146
177
|
end
|
147
178
|
|
179
|
+
# Top-level chapters of this act. Chapters inside parts are ignored.
|
180
|
+
# @return [Array<Nokogiri::XML::Node>] chapter nodes
|
148
181
|
def chapters
|
149
182
|
@body.xpath('./a:chapter', a: NS)
|
150
183
|
end
|
151
184
|
|
185
|
+
# Sections of this act
|
186
|
+
# @return [Array<Nokogiri::XML::Node>] section nodes
|
152
187
|
def sections
|
153
188
|
@body.xpath('.//a:section', a: NS)
|
154
189
|
end
|
155
190
|
|
156
|
-
# The
|
191
|
+
# The primary definitions section of this act, identified by
|
192
|
+
# either an `id` of `definitions` or the first section with a heading
|
193
|
+
# of `Definitions`.
|
194
|
+
#
|
195
|
+
# @return [Nokogiri::XML::Node, nil] definitions node or nil
|
157
196
|
def definitions
|
158
197
|
# try looking for the definition list
|
159
198
|
defn = @body.at_css('#definitions')
|
@@ -166,14 +205,21 @@ module Slaw
|
|
166
205
|
nil
|
167
206
|
end
|
168
207
|
|
169
|
-
#
|
208
|
+
# An act can contain schedules, additional (generally free-form) documents
|
209
|
+
# that are addendums to the main body. A definition element must be
|
210
|
+
# part of a separate `component` and have a `doc` element with a name attribute
|
211
|
+
# of `schedules`.
|
212
|
+
#
|
213
|
+
# @return [Nokogiri::XML::Node, nil] schedules document node
|
170
214
|
def schedules
|
171
215
|
@doc.at_xpath('/a:akomaNtoso/a:components/a:component/a:doc[@name="schedules"]/a:mainBody', a: NS)
|
172
216
|
end
|
173
217
|
|
174
|
-
# Get a map from term ids to
|
175
|
-
# where
|
176
|
-
# the XML
|
218
|
+
# Get a map from term ids to `[term, defn]` pairs,
|
219
|
+
# where `term` is the plain text term and `defn` is
|
220
|
+
# the {Nokogiri::XML::Node} containing the definition.
|
221
|
+
#
|
222
|
+
# @return {String => List(String, Nokogiri::XML::Node)} map from strings to `[term, definition]` pairs
|
177
223
|
def term_definitions
|
178
224
|
terms = {}
|
179
225
|
|
@@ -191,23 +237,31 @@ module Slaw
|
|
191
237
|
end
|
192
238
|
|
193
239
|
# Returns the publication element, if any.
|
240
|
+
#
|
241
|
+
# @return [Nokogiri::XML::Node, nil]
|
194
242
|
def publication
|
195
243
|
@meta.at_xpath('./a:publication', a: NS)
|
196
244
|
end
|
197
245
|
|
198
246
|
# Has this by-law been repealed?
|
247
|
+
#
|
248
|
+
# @return [Boolean]
|
199
249
|
def repealed?
|
200
250
|
!!repealed_on
|
201
251
|
end
|
202
252
|
|
203
253
|
# The date on which this act was repealed, or nil if never repealed
|
254
|
+
#
|
255
|
+
# @return [Time, nil] date of repeal, or nil if never repealed
|
204
256
|
def repealed_on
|
205
257
|
repeal_el = repeal
|
206
258
|
repeal_el ? Time.parse(repeal_el['date']) : nil
|
207
259
|
end
|
208
260
|
|
209
261
|
# The element representing the reference that caused the repeal of this
|
210
|
-
# act, or nil
|
262
|
+
# act, or nil.
|
263
|
+
#
|
264
|
+
# @return [Nokogiri::XML::Node] element of reference to repealing act, or nil
|
211
265
|
def repealed_by
|
212
266
|
repeal_el = repeal
|
213
267
|
return nil unless repeal_el
|
@@ -216,7 +270,9 @@ module Slaw
|
|
216
270
|
@meta.at_xpath("./a:references/a:passiveRef[@id='#{source_id}']", a: NS)
|
217
271
|
end
|
218
272
|
|
219
|
-
# The XML element representing the repeal of this act, or nil
|
273
|
+
# The XML element representing the event of repeal of this act, or nil
|
274
|
+
#
|
275
|
+
# @return [Nokogiri::XML::Node]
|
220
276
|
def repeal
|
221
277
|
# <lifecycle source="#this">
|
222
278
|
# <eventRef id="e1" date="2010-07-28" source="#original" type="generation"/>
|
@@ -226,11 +282,15 @@ module Slaw
|
|
226
282
|
@meta.at_xpath('./a:lifecycle/a:eventRef[@type="repeal"]', a: NS)
|
227
283
|
end
|
228
284
|
|
285
|
+
# The date at which this particular XML manifestation of this document was generated.
|
286
|
+
#
|
287
|
+
# @return [String] date, YYYY-MM-DD
|
229
288
|
def manifestation_date
|
230
289
|
node = @meta.at_xpath('./a:identification/a:FRBRManifestation/a:FRBRdate[@name="Generation"]', a: NS)
|
231
290
|
node && node['date']
|
232
291
|
end
|
233
292
|
|
293
|
+
# The underlying nature of this act, usually `act` although subclasses may override this.
|
234
294
|
def nature
|
235
295
|
"act"
|
236
296
|
end
|
data/lib/slaw/bylaw.rb
CHANGED
@@ -1,12 +1,19 @@
|
|
1
1
|
require 'slaw/act'
|
2
2
|
|
3
3
|
module Slaw
|
4
|
-
#
|
4
|
+
# An extension of {Slaw::Act} which wraps an AkomaNtoso XML document describing a By-Law.
|
5
|
+
#
|
6
|
+
# There are minor differences between Acts and By-laws, the most notable being that a by-law
|
7
|
+
# is not identified by a year and a number, and therefore has a different FRBR uri structure.
|
5
8
|
class ByLaw < Act
|
6
9
|
|
7
|
-
|
10
|
+
# [String] The region this by-law applies to
|
11
|
+
attr_accessor :region
|
12
|
+
|
13
|
+
# [String] A short file-like name of this by-law, unique within its year and region
|
14
|
+
attr_accessor :name
|
8
15
|
|
9
|
-
def
|
16
|
+
def _extract_id
|
10
17
|
# /za/by-law/cape-town/2010/public-parks
|
11
18
|
|
12
19
|
@id_uri = @meta.at_xpath('./a:identification/a:FRBRWork/a:FRBRuri', a: NS)['value']
|
@@ -22,30 +29,16 @@ module Slaw
|
|
22
29
|
end
|
23
30
|
|
24
31
|
def short_title
|
25
|
-
|
26
|
-
|
27
|
-
if node
|
28
|
-
@short_title = node['value']
|
29
|
-
else
|
30
|
-
@short_title = "(Unknown)"
|
31
|
-
end
|
32
|
-
|
33
|
-
if amended? and not @short_title.end_with?("as amended")
|
34
|
-
@short_title = @short_title + " as amended"
|
35
|
-
end
|
36
|
-
end
|
32
|
+
node = @meta.at_xpath('./a:identification/a:FRBRWork/a:FRBRalias', a: NS)
|
33
|
+
short_title = node ? node['value'] : "(Unknown)"
|
37
34
|
|
38
|
-
|
39
|
-
|
35
|
+
if amended? and not short_title.end_with?("as amended")
|
36
|
+
short_title = short_title + " as amended"
|
37
|
+
end
|
40
38
|
|
41
|
-
|
42
|
-
"/#{@country}/by-law/#{@region}/#{@year}/#{@name}/"
|
39
|
+
short_title
|
43
40
|
end
|
44
41
|
|
45
|
-
def url_file
|
46
|
-
@name
|
47
|
-
end
|
48
|
-
|
49
42
|
def nature
|
50
43
|
"by-law"
|
51
44
|
end
|
data/lib/slaw/collection.rb
CHANGED
@@ -2,11 +2,28 @@ require 'forwardable'
|
|
2
2
|
|
3
3
|
module Slaw
|
4
4
|
# A collection of Act instances.
|
5
|
+
#
|
6
|
+
# This is useful for looking up acts by their FRBR uri and for
|
7
|
+
# loading a collection of XML act documents.
|
8
|
+
#
|
9
|
+
# This collection is enumerable and can be iterated over. Use {#items} to
|
10
|
+
# access the underlying array of objects.
|
11
|
+
#
|
12
|
+
# @example Load a collection of acts and then iterate over them.
|
13
|
+
#
|
14
|
+
# acts = Slaw::DocumentCollection.new
|
15
|
+
# acts.discover('/path/to/acts/')
|
16
|
+
#
|
17
|
+
# for act in acts
|
18
|
+
# puts act.short_title
|
19
|
+
# end
|
20
|
+
#
|
5
21
|
class DocumentCollection
|
6
22
|
|
7
23
|
include Enumerable
|
8
24
|
extend Forwardable
|
9
25
|
|
26
|
+
# [Array<Act>] The underlying array of acts
|
10
27
|
attr_accessor :items
|
11
28
|
|
12
29
|
def_delegators :items, :each, :<<, :length
|
@@ -15,16 +32,27 @@ module Slaw
|
|
15
32
|
@items = items || []
|
16
33
|
end
|
17
34
|
|
18
|
-
# Find all XML files in
|
19
|
-
#
|
35
|
+
# Find all XML files in `path` and add them into this
|
36
|
+
# collection.
|
37
|
+
#
|
38
|
+
# @param path [String] the path to glob for xml files
|
39
|
+
# @param cls [Class] the class to instantiate for each file
|
40
|
+
#
|
41
|
+
# @return [DocumentCollection] this collection
|
20
42
|
def discover(path, cls=Slaw::Act)
|
21
43
|
for fname in Dir.glob("#{path}/**/*.xml")
|
22
44
|
@items << cls.new(fname)
|
23
45
|
end
|
46
|
+
|
47
|
+
self
|
24
48
|
end
|
25
49
|
|
26
50
|
# Try to find an act whose FRBRuri matches this one,
|
27
51
|
# returning nil on failure
|
52
|
+
#
|
53
|
+
# @param uri [String] the uri to look for
|
54
|
+
#
|
55
|
+
# @return [Act, nil] the act, or nil
|
28
56
|
def for_uri(uri)
|
29
57
|
return @items.find { |doc| doc.id_uri == uri }
|
30
58
|
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
require 'open3'
|
2
|
+
|
3
|
+
module Slaw
|
4
|
+
module Extract
|
5
|
+
|
6
|
+
# Routines for extracting and cleaning up content from other formats, such as PDF.
|
7
|
+
#
|
8
|
+
# You may need to set the location of the `pdftotext` binary.
|
9
|
+
#
|
10
|
+
# On Mac OS X, use `brew install xpdf` or download from http://www.foolabs.com/xpdf/download.html
|
11
|
+
#
|
12
|
+
# On Heroku, you'll need to do some hoop jumping, see http://theprogrammingbutler.com/blog/archives/2011/07/28/running-pdftotext-on-heroku/
|
13
|
+
class Extractor
|
14
|
+
include Slaw::Logging
|
15
|
+
|
16
|
+
@@pdftotext_path = "pdftotext"
|
17
|
+
|
18
|
+
# Object with text cleaning helpers
|
19
|
+
attr_accessor :cleanser
|
20
|
+
|
21
|
+
def initialize
|
22
|
+
@cleanser = Slaw::Parse::Cleanser.new
|
23
|
+
end
|
24
|
+
|
25
|
+
# Extract text from a file and run cleanup on it.
|
26
|
+
#
|
27
|
+
# @param filename [String] filename to extract from
|
28
|
+
#
|
29
|
+
# @return [String] extracted text
|
30
|
+
def extract_from_file(filename)
|
31
|
+
ext = filename[-4..-1].downcase
|
32
|
+
|
33
|
+
case ext
|
34
|
+
when '.pdf'
|
35
|
+
extract_from_pdf(filename)
|
36
|
+
when '.txt'
|
37
|
+
extract_from_text(filename)
|
38
|
+
else
|
39
|
+
raise ArgumentError.new("Unsupported file type #{ext}")
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
# Extract text from a PDF
|
44
|
+
#
|
45
|
+
# @param filename [String] filename to extract from
|
46
|
+
#
|
47
|
+
# @return [String] extracted text
|
48
|
+
def extract_from_pdf(filename)
|
49
|
+
cmd = pdf_to_text_cmd(filename)
|
50
|
+
logger.info("Executing: #{cmd}")
|
51
|
+
stdout, status = Open3.capture2(*cmd)
|
52
|
+
|
53
|
+
if status == 0
|
54
|
+
cleanup(stdout)
|
55
|
+
else
|
56
|
+
nil
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
# Build a command for the external PDF-to-text utility.
|
61
|
+
#
|
62
|
+
# @param filename [String] the pdf file
|
63
|
+
#
|
64
|
+
# @return [Array<String>] command and params to execute
|
65
|
+
def pdf_to_text_cmd(filename)
|
66
|
+
[Extractor.pdftotext_path, "-enc", "UTF-8", filename, "-"]
|
67
|
+
end
|
68
|
+
|
69
|
+
def extract_from_text(filename)
|
70
|
+
cleanup(File.read(filename))
|
71
|
+
end
|
72
|
+
|
73
|
+
# Run general once-off cleanup of extracted text.
|
74
|
+
def cleanup(text)
|
75
|
+
text = @cleanser.cleanup(text)
|
76
|
+
text = @cleanser.remove_empty_lines(text)
|
77
|
+
text = @cleanser.reformat(text)
|
78
|
+
|
79
|
+
text
|
80
|
+
end
|
81
|
+
|
82
|
+
# Get location of the pdftotext executable for all instances.
|
83
|
+
def self.pdftotext_path
|
84
|
+
@@pdftotext_path
|
85
|
+
end
|
86
|
+
|
87
|
+
# Set location of the pdftotext executable for all instances.
|
88
|
+
def self.pdftotext_path=(val)
|
89
|
+
@@pdftotext_path = val
|
90
|
+
end
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
@@ -3,28 +3,36 @@ module Slaw
|
|
3
3
|
module Blocklists
|
4
4
|
include Slaw::Namespace
|
5
5
|
|
6
|
-
# Correctly re-nest nested block lists.
|
6
|
+
# Correctly re-nest nested block lists. We do this by identifying the
|
7
|
+
# numbering format of each item in the list and comparing it with the
|
8
|
+
# surrounding elements. When the numbering format changes, we start
|
9
|
+
# a new nested list.
|
7
10
|
#
|
8
|
-
# (
|
9
|
-
# (
|
10
|
-
#
|
11
|
-
# (ii)
|
12
|
-
# (aa)
|
13
|
-
# (bb)
|
14
|
-
# (c)
|
15
|
-
# (d)
|
11
|
+
# We make sure to handle special cases such as `(i)` coming between
|
12
|
+
# `(h)` and `(j)` versus being at the start of a `(i), (ii), (iii)`
|
13
|
+
# list.
|
16
14
|
#
|
17
|
-
#
|
18
|
-
#
|
19
|
-
#
|
20
|
-
#
|
21
|
-
# (i)
|
22
|
-
# (ii)
|
15
|
+
# (a)
|
16
|
+
# (b)
|
17
|
+
# (i)
|
18
|
+
# (ii)
|
23
19
|
# (aa)
|
24
20
|
# (bb)
|
25
|
-
#
|
26
|
-
#
|
21
|
+
# (c)
|
22
|
+
# (d)
|
23
|
+
#
|
24
|
+
# becomes
|
25
|
+
#
|
26
|
+
# (a)
|
27
|
+
# (b)
|
28
|
+
# (i)
|
29
|
+
# (ii)
|
30
|
+
# (aa)
|
31
|
+
# (bb)
|
32
|
+
# (c)
|
33
|
+
# (d)
|
27
34
|
#
|
35
|
+
# @param doc [Nokogiri::XML::Document] the document
|
28
36
|
def self.nest_blocklists(doc)
|
29
37
|
doc.xpath('//a:blockList', a: NS).each do |blocklist|
|
30
38
|
items = blocklist.xpath('a:item', a: NS)
|
data/lib/slaw/parse/builder.rb
CHANGED
@@ -1,25 +1,67 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
1
3
|
require 'builder'
|
2
4
|
require 'treetop'
|
3
5
|
|
4
6
|
module Slaw
|
5
7
|
module Parse
|
6
|
-
#
|
8
|
+
# The primary class for building Akoma Ntoso documents from plain text documents.
|
9
|
+
#
|
10
|
+
# The builder uses a grammar to break down a plain-text version of an act into a
|
11
|
+
# syntax tree. This tree can then be serialized into an Akoma Ntoso compatible
|
12
|
+
# XML document.
|
13
|
+
#
|
14
|
+
# @example Parse some text into a well-formed document
|
15
|
+
# builder = Slaw::Builder.new
|
16
|
+
# xml = builder.parse_text(text)
|
17
|
+
# doc = builder.parse_xml(xml)
|
18
|
+
# builder.postprocess(doc)
|
19
|
+
#
|
20
|
+
# @example A quicker way to build a well-formed document
|
21
|
+
# builder = Slaw::Builder.new
|
22
|
+
# doc = builder.parse_and_process_text(text)
|
7
23
|
#
|
8
|
-
# It can convert from plain text a new Akoma Ntoso document, or
|
9
|
-
# update existing documents.
|
10
24
|
class Builder
|
11
25
|
include Slaw::Namespace
|
12
26
|
include Slaw::Logging
|
13
27
|
|
14
28
|
Treetop.load(File.dirname(__FILE__) + "/bylaw.treetop")
|
15
29
|
|
30
|
+
# [Hash] A Hash of options that are made available to the parser when parsing.
|
16
31
|
attr_accessor :parse_options
|
17
32
|
|
18
|
-
def initialize()
|
19
|
-
@parse_options =
|
33
|
+
def initialize(parse_options={})
|
34
|
+
@parse_options = parse_options
|
35
|
+
end
|
36
|
+
|
37
|
+
# Do all the work necessary to parse text into a well-formed XML document.
|
38
|
+
#
|
39
|
+
# @param text [String] the text to parse
|
40
|
+
# @param root [Symbol] the root element of the grammar
|
41
|
+
#
|
42
|
+
# @return [Nokogiri::XML::Document] a well formed document
|
43
|
+
def parse_and_process_text(text, root=:bylaw)
|
44
|
+
postprocess(parse_xml(parse_text(text, root)))
|
45
|
+
end
|
46
|
+
|
47
|
+
# Parse text into XML. You should still run {#postprocess} on the
|
48
|
+
# resulting XML to normalise it.
|
49
|
+
#
|
50
|
+
# @param text [String] the text to parse
|
51
|
+
# @param root [Symbol] the root element of the grammar
|
52
|
+
#
|
53
|
+
# @return [String] an XML string
|
54
|
+
def parse_text(text, root=:bylaw)
|
55
|
+
tree = text_to_syntax_tree(text, root)
|
56
|
+
xml_from_syntax_tree(tree)
|
20
57
|
end
|
21
58
|
|
22
|
-
#
|
59
|
+
# Parse plain text into a syntax tree.
|
60
|
+
#
|
61
|
+
# @param text [String] the text to parse
|
62
|
+
# @param root [Symbol] the root element of the grammar
|
63
|
+
#
|
64
|
+
# @return [Object] the root of the resulting parse tree, usually a Treetop::Node object
|
23
65
|
def text_to_syntax_tree(text, root=:bylaw)
|
24
66
|
parser = Slaw::Parse::BylawParser.new
|
25
67
|
parser.options = @parse_options
|
@@ -35,7 +77,12 @@ module Slaw
|
|
35
77
|
tree
|
36
78
|
end
|
37
79
|
|
38
|
-
# Generate an XML document from the given syntax tree.
|
80
|
+
# Generate an XML document from the given syntax tree. You should still
|
81
|
+
# run {#postprocess} on the resulting XML to normalise it.
|
82
|
+
#
|
83
|
+
# @param tree [Object] a Treetop::Node object
|
84
|
+
#
|
85
|
+
# @return [String] an XML string
|
39
86
|
def xml_from_syntax_tree(tree)
|
40
87
|
s = ""
|
41
88
|
builder = ::Builder::XmlMarkup.new(indent: 2, target: s)
|
@@ -50,38 +97,41 @@ module Slaw
|
|
50
97
|
s
|
51
98
|
end
|
52
99
|
|
100
|
+
# Parse a string into a Nokogiri::XML::Document
|
101
|
+
#
|
102
|
+
# @param xml [String] string to parse
|
103
|
+
#
|
104
|
+
# @return [Nokogiri::XML::Document]
|
53
105
|
def parse_xml(xml)
|
54
106
|
Nokogiri::XML(xml, &:noblanks)
|
55
107
|
end
|
56
108
|
|
109
|
+
# Serialise a Nokogiri::XML::Document into a string
|
110
|
+
#
|
111
|
+
# @param doc [Nokogiri::XML::Document] document
|
112
|
+
#
|
113
|
+
# @return [String] pretty printed string
|
57
114
|
def to_xml(doc)
|
58
115
|
doc.to_xml(indent: 2)
|
59
116
|
end
|
60
117
|
|
61
|
-
#
|
62
|
-
#
|
118
|
+
# Postprocess an XML document.
|
119
|
+
#
|
120
|
+
# @param doc [Nokogiri::XML::Document]
|
121
|
+
#
|
122
|
+
# @return [Nokogiri::XML::Document] the updated document
|
63
123
|
def postprocess(doc)
|
64
124
|
normalise_headings(doc)
|
65
125
|
find_short_title(doc)
|
66
|
-
sanitise(doc)
|
67
|
-
end
|
68
|
-
|
69
|
-
# Do sanitisations, such as finding and linking definitions
|
70
|
-
def sanitise(doc)
|
71
126
|
link_definitions(doc)
|
72
127
|
nest_blocklists(doc)
|
73
|
-
end
|
74
128
|
|
75
|
-
|
76
|
-
def renumber_terms(doc)
|
77
|
-
logger.info("Renumbering terms")
|
78
|
-
|
79
|
-
doc.xpath('//a:term', a: NS).each_with_index do |term, i|
|
80
|
-
term['id'] = "trm#{i}"
|
81
|
-
end
|
129
|
+
doc
|
82
130
|
end
|
83
131
|
|
84
132
|
# Change CAPCASE headings into Sentence case.
|
133
|
+
#
|
134
|
+
# @param doc [Nokogiri::XML::Document]
|
85
135
|
def normalise_headings(doc)
|
86
136
|
logger.info("Normalising headings")
|
87
137
|
|
@@ -94,6 +144,8 @@ module Slaw
|
|
94
144
|
end
|
95
145
|
|
96
146
|
# Find the short title and add it as an FRBRalias element in the meta section
|
147
|
+
#
|
148
|
+
# @param doc [Nokogiri::XML::Document]
|
97
149
|
def find_short_title(doc)
|
98
150
|
logger.info("Finding short title")
|
99
151
|
|
@@ -117,6 +169,8 @@ module Slaw
|
|
117
169
|
|
118
170
|
# Find definitions of terms and introduce them into the
|
119
171
|
# meta section of the document.
|
172
|
+
#
|
173
|
+
# @param doc [Nokogiri::XML::Document]
|
120
174
|
def link_definitions(doc)
|
121
175
|
logger.info("Finding and linking definitions")
|
122
176
|
|
@@ -126,6 +180,12 @@ module Slaw
|
|
126
180
|
renumber_terms(doc)
|
127
181
|
end
|
128
182
|
|
183
|
+
# Find `def` elements in the document and return a Hash from
|
184
|
+
# term ids to the text of each term
|
185
|
+
#
|
186
|
+
# @param doc [Nokogiri::XML::Document]
|
187
|
+
#
|
188
|
+
# @return [Hash{String, String}]
|
129
189
|
def find_definitions(doc)
|
130
190
|
guess_at_definitions(doc)
|
131
191
|
|
@@ -239,6 +299,21 @@ module Slaw
|
|
239
299
|
end
|
240
300
|
end
|
241
301
|
|
302
|
+
# recalculate ids for <term> elements
|
303
|
+
def renumber_terms(doc)
|
304
|
+
logger.info("Renumbering terms")
|
305
|
+
|
306
|
+
doc.xpath('//a:term', a: NS).each_with_index do |term, i|
|
307
|
+
term['id'] = "trm#{i}"
|
308
|
+
end
|
309
|
+
end
|
310
|
+
|
311
|
+
# Correctly nest blocklists.
|
312
|
+
#
|
313
|
+
# The grammar gives us flat blocklists, we need to introspect the
|
314
|
+
# numbering of the lists to correctly nest them.
|
315
|
+
#
|
316
|
+
# @param doc [Nokogiri::XML::Document]
|
242
317
|
def nest_blocklists(doc)
|
243
318
|
logger.info("Nesting blocklists")
|
244
319
|
|
data/lib/slaw/parse/cleanser.rb
CHANGED
@@ -50,16 +50,17 @@ module Slaw
|
|
50
50
|
.gsub("", '')
|
51
51
|
end
|
52
52
|
|
53
|
+
# change weird quotes to normal ones
|
53
54
|
def fix_quotes(s)
|
54
|
-
# change weird quotes to normal ones
|
55
55
|
s.gsub(/‘‘|’’|''/, '"')
|
56
56
|
end
|
57
57
|
|
58
|
+
# tabs to spaces
|
58
59
|
def expand_tabs(s)
|
59
|
-
# tabs to spaces
|
60
60
|
s.gsub(/\t/, ' ')
|
61
61
|
end
|
62
62
|
|
63
|
+
# Try to remove boilerplate lines found in many files, such as page numbers.
|
63
64
|
def remove_boilerplate(s)
|
64
65
|
# nuke any line to do with Sabinet and the government printer
|
65
66
|
s.gsub(/^.*Sabinet.*Government Printer.*$/i, '')\
|
@@ -72,6 +73,8 @@ module Slaw
|
|
72
73
|
.gsub(/^\s*page \d+( of \d+)?\s*\n/i, '')
|
73
74
|
end
|
74
75
|
|
76
|
+
# Get rid of whitespace at the end of lines and at the start and end of the
|
77
|
+
# entire string.
|
75
78
|
def chomp(s)
|
76
79
|
# trailing whitespace at end of lines
|
77
80
|
s = s.gsub(/ +$/, '')
|
@@ -85,8 +88,11 @@ module Slaw
|
|
85
88
|
s.end_with?("\n") ? s : (s + "\n")
|
86
89
|
end
|
87
90
|
|
88
|
-
#
|
89
|
-
# have been broken but haven't, and break them
|
91
|
+
# Make educated guesses about lines that should
|
92
|
+
# have been broken but haven't, and break them.
|
93
|
+
#
|
94
|
+
# This is very dependent on a locale's legislation grammar, there are
|
95
|
+
# lots of rules of thumb that make this work.
|
90
96
|
def break_lines(s)
|
91
97
|
# often we find a section title munged onto the same line as its first statement
|
92
98
|
# eg:
|
@@ -115,8 +121,8 @@ module Slaw
|
|
115
121
|
s
|
116
122
|
end
|
117
123
|
|
118
|
-
#
|
119
|
-
# and
|
124
|
+
# Find likely candidates for unnecessarily broken lines
|
125
|
+
# and unbreaks them.
|
120
126
|
def unbreak_lines(s)
|
121
127
|
lines = s.split(/\n/)
|
122
128
|
output = []
|
@@ -141,8 +147,8 @@ module Slaw
|
|
141
147
|
output.join("\n")
|
142
148
|
end
|
143
149
|
|
144
|
-
#
|
145
|
-
# it really confuses the grammer
|
150
|
+
# Do our best to remove table of contents at the start,
|
151
|
+
# it really confuses the grammar.
|
146
152
|
def strip_toc(s)
|
147
153
|
# first, try to find 'TABLE OF CONTENTS' anywhere within the first 4K of text,
|
148
154
|
if toc_start = s[0..4096].match(/TABLE OF CONTENTS/i)
|
data/lib/slaw/render/html.rb
CHANGED
@@ -2,7 +2,14 @@ module Slaw
|
|
2
2
|
module Render
|
3
3
|
|
4
4
|
# Support for transforming XML AN documents into HTML.
|
5
|
+
#
|
6
|
+
# This rendering is done using XSLT stylesheets. Both an entire
|
7
|
+
# document and fragments can be rendered.
|
5
8
|
class HTMLRenderer
|
9
|
+
|
10
|
+
# [Hash] A Hash of Nokogiri::XSLT objects
|
11
|
+
attr_accessor :xslt
|
12
|
+
|
6
13
|
def initialize
|
7
14
|
here = File.dirname(__FILE__)
|
8
15
|
|
@@ -12,12 +19,17 @@ module Slaw
|
|
12
19
|
}
|
13
20
|
end
|
14
21
|
|
15
|
-
# Transform an entire XML document
|
16
|
-
# Specify
|
22
|
+
# Transform an entire XML document (a Nokogiri::XML::Document object) into HTML.
|
23
|
+
# Specify `base_url` to manage the base for relative URLs generated by
|
17
24
|
# the transform.
|
25
|
+
#
|
26
|
+
# @param doc [Nokogiri::XML::Document] document to render
|
27
|
+
# @param base_url [String] root URL for relative URLs (cannot be empty)
|
28
|
+
#
|
29
|
+
# @return [String]
|
18
30
|
def render(doc, base_url='')
|
19
|
-
params =
|
20
|
-
|
31
|
+
params = _transform_params({'base_url' => base_url})
|
32
|
+
_run_xslt(:act, doc, params)
|
21
33
|
end
|
22
34
|
|
23
35
|
# Transform just a single node and its children into HTML.
|
@@ -25,8 +37,13 @@ module Slaw
|
|
25
37
|
# If +elem+ has an id, we use xpath to tell the XSLT which
|
26
38
|
# element to transform. Otherwise we copy the node into a new
|
27
39
|
# tree and apply the XSLT to that.
|
40
|
+
#
|
41
|
+
# @param node [Nokogiri::XML::Node] node to render
|
42
|
+
# @param base_url [String] root URL for relative URLs (cannot be empty)
|
43
|
+
#
|
44
|
+
# @return [String]
|
28
45
|
def render_node(node, base_url='')
|
29
|
-
params =
|
46
|
+
params = _transform_params({'base_url' => base_url})
|
30
47
|
|
31
48
|
if node.id
|
32
49
|
params += ['root_elem', "//*[@id='#{node.id}']"]
|
@@ -38,14 +55,14 @@ module Slaw
|
|
38
55
|
params += ['root_elem', '*']
|
39
56
|
end
|
40
57
|
|
41
|
-
|
58
|
+
_run_xslt(:fragment, doc, params)
|
42
59
|
end
|
43
60
|
|
44
|
-
def
|
61
|
+
def _run_xslt(xslt, doc, params)
|
45
62
|
@xslt[xslt].transform(doc, params).to_s
|
46
63
|
end
|
47
64
|
|
48
|
-
def
|
65
|
+
def _transform_params(params)
|
49
66
|
Nokogiri::XSLT.quote_params(params)
|
50
67
|
end
|
51
68
|
end
|
@@ -77,7 +77,7 @@
|
|
77
77
|
<xsl:value-of select="@refersTo" />
|
78
78
|
</xsl:attribute>
|
79
79
|
|
80
|
-
<xsl:attribute name="href"><xsl:value-of select="$base_url"
|
80
|
+
<xsl:attribute name="href"><xsl:value-of select="$base_url" />/definitions/#def-<xsl:value-of select="translate(@refersTo, '#', '')" /></xsl:attribute>
|
81
81
|
|
82
82
|
<xsl:apply-templates />
|
83
83
|
</a>
|
data/lib/slaw/version.rb
CHANGED
data/slaw.gemspec
CHANGED
@@ -23,7 +23,6 @@ Gem::Specification.new do |spec|
|
|
23
23
|
spec.add_development_dependency "rspec", "~> 2.14.1"
|
24
24
|
|
25
25
|
spec.add_runtime_dependency "nokogiri", "~> 1.6.0"
|
26
|
-
spec.add_runtime_dependency "elasticsearch", "~> 1.0.5"
|
27
26
|
spec.add_runtime_dependency "treetop", "~> 1.5"
|
28
27
|
spec.add_runtime_dependency "builder", "~> 3.2.2"
|
29
28
|
spec.add_runtime_dependency "log4r", "~> 1.1.10"
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'tempfile'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
require 'slaw'
|
5
|
+
|
6
|
+
describe Slaw::Extract::Extractor do
|
7
|
+
it 'should extract from plain text' do
|
8
|
+
f = Tempfile.new(['test', '.txt'])
|
9
|
+
f.write('This is some text')
|
10
|
+
f.rewind
|
11
|
+
|
12
|
+
subject.extract_from_file(f.path).should == "This is some text\n"
|
13
|
+
end
|
14
|
+
end
|
data/spec/parse/builder_spec.rb
CHANGED
data/spec/parse/bylaw_spec.rb
CHANGED
data/spec/parse/cleanser_spec.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: slaw
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Greg Kempe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-09-
|
11
|
+
date: 2014-09-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -66,20 +66,6 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: 1.6.0
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: elasticsearch
|
71
|
-
requirement: !ruby/object:Gem::Requirement
|
72
|
-
requirements:
|
73
|
-
- - "~>"
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: 1.0.5
|
76
|
-
type: :runtime
|
77
|
-
prerelease: false
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
79
|
-
requirements:
|
80
|
-
- - "~>"
|
81
|
-
- !ruby/object:Gem::Version
|
82
|
-
version: 1.0.5
|
83
69
|
- !ruby/object:Gem::Dependency
|
84
70
|
name: treetop
|
85
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -131,6 +117,7 @@ extensions: []
|
|
131
117
|
extra_rdoc_files: []
|
132
118
|
files:
|
133
119
|
- ".gitignore"
|
120
|
+
- ".travis.yml"
|
134
121
|
- Gemfile
|
135
122
|
- LICENSE.txt
|
136
123
|
- README.md
|
@@ -139,7 +126,7 @@ files:
|
|
139
126
|
- lib/slaw/act.rb
|
140
127
|
- lib/slaw/bylaw.rb
|
141
128
|
- lib/slaw/collection.rb
|
142
|
-
- lib/slaw/
|
129
|
+
- lib/slaw/extract/extractor.rb
|
143
130
|
- lib/slaw/lifecycle_event.rb
|
144
131
|
- lib/slaw/logging.rb
|
145
132
|
- lib/slaw/namespace.rb
|
@@ -157,6 +144,7 @@ files:
|
|
157
144
|
- lib/slaw/version.rb
|
158
145
|
- lib/slaw/xml_support.rb
|
159
146
|
- slaw.gemspec
|
147
|
+
- spec/extract/extractor_spec.rb
|
160
148
|
- spec/parse/builder_spec.rb
|
161
149
|
- spec/parse/bylaw_spec.rb
|
162
150
|
- spec/parse/cleanser_spec.rb
|
@@ -187,6 +175,7 @@ signing_key:
|
|
187
175
|
specification_version: 4
|
188
176
|
summary: A lightweight library for using Akoma Ntoso acts in Ruby.
|
189
177
|
test_files:
|
178
|
+
- spec/extract/extractor_spec.rb
|
190
179
|
- spec/parse/builder_spec.rb
|
191
180
|
- spec/parse/bylaw_spec.rb
|
192
181
|
- spec/parse/cleanser_spec.rb
|
data/lib/slaw/elasticsearch.rb
DELETED
@@ -1,107 +0,0 @@
|
|
1
|
-
require 'elasticsearch'
|
2
|
-
require 'log4r'
|
3
|
-
|
4
|
-
module Slaw
|
5
|
-
# Support for indexing and search using elasticsearch
|
6
|
-
class ElasticSearchSupport
|
7
|
-
attr_accessor :es, :mapping, :index, :type, :base_url
|
8
|
-
|
9
|
-
def initialize(index, type, base_url, client_params={}, es=nil)
|
10
|
-
@es = es || create_client(client_params)
|
11
|
-
|
12
|
-
@ix = index
|
13
|
-
@type = type
|
14
|
-
@base_url = base_url
|
15
|
-
|
16
|
-
@mapping = {
|
17
|
-
frbr_uri: {type: 'string', index: 'not_analyzed'},
|
18
|
-
url: {type: 'string', index: 'not_analyzed'},
|
19
|
-
title: {type: 'string', analyzer: 'english'},
|
20
|
-
content: {type: 'string', analyzer: 'english'},
|
21
|
-
published_on: {type: 'date', format: 'dateOptionalTime'},
|
22
|
-
region: {type: 'string', index: 'not_analyzed'},
|
23
|
-
region_name: {type: 'string', index: 'not_analyzed'},
|
24
|
-
repealed: {type: 'boolean'},
|
25
|
-
}
|
26
|
-
|
27
|
-
@log = Log4r::Logger['Slaw']
|
28
|
-
end
|
29
|
-
|
30
|
-
def create_client(client_params)
|
31
|
-
Elasticsearch::Client.new(client_params)
|
32
|
-
end
|
33
|
-
|
34
|
-
def reindex!(docs, &block)
|
35
|
-
define_mapping!
|
36
|
-
index_documents!(docs, &block)
|
37
|
-
end
|
38
|
-
|
39
|
-
def index_documents!(docs, &block)
|
40
|
-
for doc in docs
|
41
|
-
id = doc.id_uri.gsub('/', '-')
|
42
|
-
|
43
|
-
data = {
|
44
|
-
frbr_uri: doc.id_uri,
|
45
|
-
url: @base_url + doc.id_uri,
|
46
|
-
title: doc.short_title,
|
47
|
-
content: doc.body.text,
|
48
|
-
region: doc.region,
|
49
|
-
published_on: doc.publication['date'],
|
50
|
-
repealed: doc.repealed?,
|
51
|
-
}
|
52
|
-
|
53
|
-
yield doc, data if block_given?
|
54
|
-
|
55
|
-
@log.info("Indexing #{id}")
|
56
|
-
@es.index(index: @ix, type: @type, id: id, body: data)
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
def define_mapping!
|
61
|
-
@log.info("Deleting index")
|
62
|
-
@es.indices.create(index: @ix) unless @es.indices.exists(index: @ix)
|
63
|
-
|
64
|
-
# delete existing mapping
|
65
|
-
unless @es.indices.get_mapping(index: @ix, type: @type).empty?
|
66
|
-
@es.indices.delete_mapping(index: @ix, type: @type)
|
67
|
-
end
|
68
|
-
|
69
|
-
@log.info("Defining mappings")
|
70
|
-
@es.indices.put_mapping(index: @ix, type: @type, body: {
|
71
|
-
@type => {properties: @mapping}
|
72
|
-
})
|
73
|
-
end
|
74
|
-
|
75
|
-
def search(q, from=0, size=10)
|
76
|
-
@es.search(index: @ix, body: {
|
77
|
-
query: {
|
78
|
-
multi_match: {
|
79
|
-
query: q,
|
80
|
-
type: 'cross_fields',
|
81
|
-
fields: ['title', 'content'],
|
82
|
-
}
|
83
|
-
},
|
84
|
-
fields: ['frbr_uri', 'repealed', 'published_on', 'title', 'url', 'region_name'],
|
85
|
-
highlight: {
|
86
|
-
order: "score",
|
87
|
-
fields: {
|
88
|
-
content: {
|
89
|
-
fragment_size: 80,
|
90
|
-
number_of_fragments: 2,
|
91
|
-
},
|
92
|
-
title: {
|
93
|
-
number_of_fragments: 0, # entire field
|
94
|
-
}
|
95
|
-
},
|
96
|
-
pre_tags: ['<mark>'],
|
97
|
-
post_tags: ['</mark>'],
|
98
|
-
},
|
99
|
-
from: from,
|
100
|
-
size: size,
|
101
|
-
sort: {
|
102
|
-
'_score' => {order: 'desc'}
|
103
|
-
}
|
104
|
-
})
|
105
|
-
end
|
106
|
-
end
|
107
|
-
end
|