slaw 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a0da8d0d88cfd753f8ef854937248c8881440390
4
- data.tar.gz: 0762a47f6b0bac65d4b3fe829bc2a4997f3b23f9
3
+ metadata.gz: 680301c5ade280eb7da5ea92c509f491631824f2
4
+ data.tar.gz: f2ddd5a99631121bf3693da5f229e38a6f590142
5
5
  SHA512:
6
- metadata.gz: 4ab788b276cd06d1735bb859a1f7fa08820f9bf65b2c6282df65ea1fd2303cbd5b42433366a3a0b2a7a20dbe227e78cc6b5caa2ab3b5cb988d6c2a27097f05ce
7
- data.tar.gz: 3882e5a3b292dfcd9adecb0b2077f9cb21a5d5e76e90402a09c77f146a1ec3acb0272649daf03cc64556864bfd5d8a921d873cbb20534c626542894a02218372
6
+ metadata.gz: 844130f24fa5e4e7e2acd8bacc9381bbd043591676a4fd22e9f1deec87e99b813f3062e4c4ec7286aca4ec0fe2a17161c39d85f5a07c8819192c82cd6203e474
7
+ data.tar.gz: de11ab3cb747c7341209e79f131506f6e2fc44065a73d95bb936c2b36b348646644024b0b657252106cd4c6d9f1b792ca4f7884e8f45fff4bda453da0a736cb7
data/.travis.yml ADDED
@@ -0,0 +1,5 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.3
4
+ - 2.1.0
5
+ - 2.1.1
data/Gemfile CHANGED
@@ -1,5 +1,4 @@
1
1
  source 'https://rubygems.org'
2
- ruby '2.1.1'
3
2
 
4
3
  # Specify your gem's dependencies in slaw.gemspec
5
4
  gemspec
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # Slaw
1
+ # Slaw [![Build Status](https://travis-ci.org/longhotsummer/slaw.svg)](http://travis-ci.org/longhotsummer/slaw)
2
2
 
3
3
  Slaw is a lightweight library for rendering and generating Akoma Ntoso acts from plain text and PDF documents.
4
4
  It is used to power [openbylaws.org.za](http://openbylaws.org.za).
@@ -21,11 +21,33 @@ Or install it yourself as:
21
21
 
22
22
  TODO: Write usage instructions here
23
23
 
24
+ ### Extracting text from PDFs
25
+
26
+ You will need [xpdf](http://www.foolabs.com/xpdf/) to run PDF extraction. If you're
27
+ on a Mac you can use
28
+
29
+ brew install xpdf
30
+
31
+ Extracting text from PDFs often breaks lines in odd places (or doesn't break them when it should). Slaw gets around
32
+ this by running some cleanup routines on the extracted text.
33
+
34
+ ```ruby
35
+ extractor = Slaw::Extract::Extractor.new
36
+
37
+ # to guess the filetype by extension
38
+ text = extractor.extract_from_file('/path/to/file.pdf')
39
+
40
+ # or if you know it's a PDF
41
+ text = extractor.extract_from_pdf('/path/to/file.pdf')
42
+
43
+ # You can also "extract" text from a plain-text file
44
+ text = extractor.extract_from_text('/path/to/file.txt')
45
+ ```
46
+
24
47
  ## Contributing
25
48
 
26
- 1. Fork it ( http://github.com/longhotsummer/slaw/fork )
49
+ 1. Fork it at http://github.com/longhotsummer/slaw/fork
27
50
  2. Create your feature branch (`git checkout -b my-new-feature`)
28
51
  3. Commit your changes (`git commit -am 'Add some feature'`)
29
52
  4. Push to the branch (`git push origin my-new-feature`)
30
53
  5. Create new Pull Request
31
-
data/Rakefile CHANGED
@@ -5,3 +5,5 @@ begin
5
5
  RSpec::Core::RakeTask.new(:spec)
6
6
  rescue LoadError
7
7
  end
8
+
9
+ task default: [:spec]
data/lib/slaw.rb CHANGED
@@ -18,7 +18,8 @@ require 'slaw/parse/cleanser'
18
18
  require 'slaw/parse/error'
19
19
  require 'slaw/parse/grammar_helpers'
20
20
  require 'slaw/parse/nodes'
21
- require 'slaw/elasticsearch'
21
+
22
+ require 'slaw/extract/extractor'
22
23
 
23
24
  module Slaw
24
25
  end
data/lib/slaw/act.rb CHANGED
@@ -1,5 +1,12 @@
1
1
  module Slaw
2
- # Wraps an AkomaNtoso 2.0 XML document describing an Act.
2
+ # An Act wraps a single {http://www.akomantoso.org/ AkomaNtoso 2.0 XML} act document in the form of a
3
+ # Nokogiri::XML::Document object.
4
+ #
5
+ # The Act object provides quick access to certain sections of the document,
6
+ # such as the metadata and the body, as well as common operations such as
7
+ # identifying whether it has been amended ({#amended?}), repealed
8
+ # ({#repealed?}) or what chapters ({#chapters}), parts ({#parts}) and
9
+ # sections ({#sections}) it contains.
3
10
  class Act
4
11
  include Slaw::Namespace
5
12
 
@@ -7,19 +14,45 @@ module Slaw
7
14
  # Act instance itself
8
15
  @@acts = {}
9
16
 
10
- attr_accessor :doc, :meta, :body, :num, :year, :id_uri
11
- attr_accessor :filename, :mtime
17
+ # [Nokogiri::XML::Document] The underlying {Nokogiri::XML::Document} instance
18
+ attr_accessor :doc
19
+
20
+ # [Nokogiri::XML::Node] The `meta` XML node
21
+ attr_accessor :meta
22
+
23
+ # [Nokogiri::XML::Node] The `body` XML node
24
+ attr_accessor :body
25
+
26
+ # [String] The year this act was published
27
+ attr_accessor :year
28
+
29
+ # [String] The act number in the year this act was published
30
+ attr_accessor :num
31
+
32
+ # [String] The FRBR URI of this act, which uniquely identifies it globally
33
+ attr_accessor :id_uri
34
+
35
+ # [String, nil] The source filename, or nil
36
+ attr_accessor :filename
37
+
38
+ # [Time, nil] The mtime of when the source file was last modified
39
+ attr_accessor :mtime
12
40
 
41
+ # Get the act that wraps the document that owns this XML node
42
+ # @param node [Nokogiri::XML::Node]
43
+ # @return [Act] owning act
13
44
  def self.for_node(node)
14
45
  @@acts[node.document]
15
46
  end
16
47
 
17
- # Create a new instance
48
+ # Create a new instance, loading from `filename` if given.
49
+ # @param filename [String] filename to load XML from
18
50
  def initialize(filename=nil)
19
51
  self.load(filename) if filename
20
52
  end
21
53
 
22
- # Load the XML from +filename+
54
+ # Load the XML in `filename` into this instance
55
+ # @param filename [String] filename
23
56
  def load(filename)
24
57
  @filename = filename
25
58
  @mtime = File::mtime(@filename)
@@ -27,7 +60,8 @@ module Slaw
27
60
  File.open(filename) { |f| parse(f) }
28
61
  end
29
62
 
30
- # Parse the XML contained in the file-like object +io+
63
+ # Parse the XML contained in the file-like object `io`
64
+ # @param io [file-like] io object with XML
31
65
  def parse(io)
32
66
  @doc = Nokogiri::XML(io)
33
67
  @meta = @doc.at_xpath('/a:akomaNtoso/a:act/a:meta', a: NS)
@@ -35,10 +69,11 @@ module Slaw
35
69
 
36
70
  @@acts[@doc] = self
37
71
 
38
- extract_id
72
+ _extract_id
39
73
  end
40
74
 
41
- def extract_id
75
+ # Parse the FRBR Uri into its constituent parts
76
+ def _extract_id
42
77
  @id_uri = @meta.at_xpath('./a:identification/a:FRBRWork/a:FRBRuri', a: NS)['value']
43
78
  empty, @country, type, date, @num = @id_uri.split('/')
44
79
 
@@ -46,48 +81,39 @@ module Slaw
46
81
  @year = date.split('-', 2)[0]
47
82
  end
48
83
 
84
+ # An applicable short title for this act, either from the `FRBRalias` element
85
+ # or based on the act number and year.
86
+ # @return [String]
49
87
  def short_title
50
- unless @short_title
51
- node = @meta.at_xpath('./a:identification/a:FRBRWork/a:FRBRalias', a: NS)
52
- if node
53
- @short_title = node['value']
54
- else
55
- @short_title = "Act #{num} of #{year}"
56
- end
57
- end
58
-
59
- @short_title
60
- end
61
-
62
- def url_path
63
- "/#{@country}/acts/#{@year}/#{@num}/"
64
- end
65
-
66
- def url_file
67
- "act-#{@year}-#{@num}"
88
+ node = @meta.at_xpath('./a:identification/a:FRBRWork/a:FRBRalias', a: NS)
89
+ node ? node['value'] : "Act #{num} of #{year}"
68
90
  end
69
91
 
70
- # Has this act been amended?
92
+ # Has this act been amended? This is determined by testing the `contains`
93
+ # attribute of the `act` root element.
94
+ #
95
+ # @return [Boolean]
71
96
  def amended?
72
97
  @doc.at_xpath('/a:akomaNtoso/a:act', a: NS)['contains'] != 'originalVersion'
73
98
  end
74
99
 
75
- # a list of LifecycleEvent objects for amendment events, in date order
100
+ # Get a list of {Slaw::LifecycleEvent} objects for amendment events, in date order.
101
+ # @return [Array<Slaw::LifecycleEvent>] possibly empty list of lifecycle events
76
102
  def amendment_events
77
103
  @meta.xpath('./a:lifecycle/a:eventRef[@type="amendment"]', a: NS).map do |event|
78
104
  LifecycleEvent.new(event)
79
105
  end.sort_by { |e| e.date }
80
106
  end
81
107
 
82
- # Mark this act as being amended by another act, either +act+
83
- # or the details in +opts+:
84
- #
85
- # :uri: uri of the amending act
86
- # :title: title of the amending act
87
- # :date: date of the amendment
108
+ # Mark this act as being amended by another act, either `act`
109
+ # or the details in `opts`.
88
110
  #
89
111
  # It is assumed that there can be only one amendment event on a particular
90
112
  # date. An existing amendment on this date is overwritten.
113
+ #
114
+ # @option opts [String] :uri uri of the amending act
115
+ # @option opts [String] :title title of the amending act
116
+ # @option opts [String] :date date of the amendment (YYYY-MM-DD)
91
117
  def amended_by!(act, opts={})
92
118
  if act
93
119
  opts[:uri] ||= act.id_uri
@@ -133,27 +159,40 @@ module Slaw
133
159
  end
134
160
 
135
161
  # Does this Act have parts?
162
+ # @return [Boolean]
136
163
  def parts?
137
164
  !parts.empty?
138
165
  end
139
166
 
167
+ # Top-level parts of this act. Parts inside chapters are ignored.
168
+ # @return [Array<Nokogiri::XML::Node>] part nodes
140
169
  def parts
141
170
  @body.xpath('./a:part', a: NS)
142
171
  end
143
172
 
173
+ # Does this Act have chapters?
174
+ # @return [Boolean]
144
175
  def chapters?
145
176
  !chapters.empty?
146
177
  end
147
178
 
179
+ # Top-level chapters of this act. Chapters inside parts are ignored.
180
+ # @return [Array<Nokogiri::XML::Node>] chapter nodes
148
181
  def chapters
149
182
  @body.xpath('./a:chapter', a: NS)
150
183
  end
151
184
 
185
+ # Sections of this act
186
+ # @return [Array<Nokogiri::XML::Node>] section nodes
152
187
  def sections
153
188
  @body.xpath('.//a:section', a: NS)
154
189
  end
155
190
 
156
- # The XML node representing the definitions section
191
+ # The primary definitions section of this act, identified by
192
+ # either an `id` of `definitions` or the first section with a heading
193
+ # of `Definitions`.
194
+ #
195
+ # @return [Nokogiri::XML::Node, nil] definitions node or nil
157
196
  def definitions
158
197
  # try looking for the definition list
159
198
  defn = @body.at_css('#definitions')
@@ -166,14 +205,21 @@ module Slaw
166
205
  nil
167
206
  end
168
207
 
169
- # The XML node representing the schedules document
208
+ # An act can contain schedules, additional (generally free-form) documents
209
+ # that are addendums to the main body. A schedules element must be
210
+ # part of a separate `component` and have a `doc` element with a name attribute
211
+ # of `schedules`.
212
+ #
213
+ # @return [Nokogiri::XML::Node, nil] schedules document node
170
214
  def schedules
171
215
  @doc.at_xpath('/a:akomaNtoso/a:components/a:component/a:doc[@name="schedules"]/a:mainBody', a: NS)
172
216
  end
173
217
 
174
- # Get a map from term ids to +[term, defn]+ pairs,
175
- # where +term+ is the text term NS+defn+ is
176
- # the XML node with the definition in it.
218
+ # Get a map from term ids to `[term, defn]` pairs,
219
+ # where `term` is the plain text term and `defn` is
220
+ # the {Nokogiri::XML::Node} containing the definition.
221
+ #
222
+ # @return [Hash{String => Array(String, Nokogiri::XML::Node)}] map from strings to `[term, definition]` pairs
177
223
  def term_definitions
178
224
  terms = {}
179
225
 
@@ -191,23 +237,31 @@ module Slaw
191
237
  end
192
238
 
193
239
  # Returns the publication element, if any.
240
+ #
241
+ # @return [Nokogiri::XML::Node, nil]
194
242
  def publication
195
243
  @meta.at_xpath('./a:publication', a: NS)
196
244
  end
197
245
 
198
246
  # Has this by-law been repealed?
247
+ #
248
+ # @return [Boolean]
199
249
  def repealed?
200
250
  !!repealed_on
201
251
  end
202
252
 
203
253
  # The date on which this act was repealed, or nil if never repealed
254
+ #
255
+ # @return [Time, nil] date of repeal or nil
204
256
  def repealed_on
205
257
  repeal_el = repeal
206
258
  repeal_el ? Time.parse(repeal_el['date']) : nil
207
259
  end
208
260
 
209
261
  # The element representing the reference that caused the repeal of this
210
- # act, or nil
262
+ # act, or nil.
263
+ #
264
+ # @return [Nokogiri::XML::Node, nil] element of reference to repealing act, or nil
211
265
  def repealed_by
212
266
  repeal_el = repeal
213
267
  return nil unless repeal_el
@@ -216,7 +270,9 @@ module Slaw
216
270
  @meta.at_xpath("./a:references/a:passiveRef[@id='#{source_id}']", a: NS)
217
271
  end
218
272
 
219
- # The XML element representing the repeal of this act, or nil
273
+ # The XML element representing the event of repeal of this act, or nil
274
+ #
275
+ # @return [Nokogiri::XML::Node]
220
276
  def repeal
221
277
  # <lifecycle source="#this">
222
278
  # <eventRef id="e1" date="2010-07-28" source="#original" type="generation"/>
@@ -226,11 +282,15 @@ module Slaw
226
282
  @meta.at_xpath('./a:lifecycle/a:eventRef[@type="repeal"]', a: NS)
227
283
  end
228
284
 
285
+ # The date at which this particular XML manifestation of this document was generated.
286
+ #
287
+ # @return [String] date, YYYY-MM-DD
229
288
  def manifestation_date
230
289
  node = @meta.at_xpath('./a:identification/a:FRBRManifestation/a:FRBRdate[@name="Generation"]', a: NS)
231
290
  node && node['date']
232
291
  end
233
292
 
293
+ # The underlying nature of this act, usually `act` although subclasses may override this.
234
294
  def nature
235
295
  "act"
236
296
  end
data/lib/slaw/bylaw.rb CHANGED
@@ -1,12 +1,19 @@
1
1
  require 'slaw/act'
2
2
 
3
3
  module Slaw
4
- # Wraps an AkomaNtoso XML document describing an Act classed as a By-Law
4
+ # An extension of {Slaw::Act} which wraps an AkomaNtoso XML document describing a By-Law.
5
+ #
6
+ # There are minor differences between Acts and By-laws, the most notable being that a by-law
7
+ # is not identified by a year and a number, and therefore has a different FRBR uri structure.
5
8
  class ByLaw < Act
6
9
 
7
- attr_accessor :region, :name
10
+ # [String] The region this by-law applies to
11
+ attr_accessor :region
12
+
13
+ # [String] A short file-like name of this by-law, unique within its year and region
14
+ attr_accessor :name
8
15
 
9
- def extract_id
16
+ def _extract_id
10
17
  # /za/by-law/cape-town/2010/public-parks
11
18
 
12
19
  @id_uri = @meta.at_xpath('./a:identification/a:FRBRWork/a:FRBRuri', a: NS)['value']
@@ -22,30 +29,16 @@ module Slaw
22
29
  end
23
30
 
24
31
  def short_title
25
- unless @short_title
26
- node = @meta.at_xpath('./a:identification/a:FRBRWork/a:FRBRalias', a: NS)
27
- if node
28
- @short_title = node['value']
29
- else
30
- @short_title = "(Unknown)"
31
- end
32
-
33
- if amended? and not @short_title.end_with?("as amended")
34
- @short_title = @short_title + " as amended"
35
- end
36
- end
32
+ node = @meta.at_xpath('./a:identification/a:FRBRWork/a:FRBRalias', a: NS)
33
+ short_title = node ? node['value'] : "(Unknown)"
37
34
 
38
- @short_title
39
- end
35
+ if amended? and not short_title.end_with?("as amended")
36
+ short_title = short_title + " as amended"
37
+ end
40
38
 
41
- def url_path
42
- "/#{@country}/by-law/#{@region}/#{@year}/#{@name}/"
39
+ short_title
43
40
  end
44
41
 
45
- def url_file
46
- @name
47
- end
48
-
49
42
  def nature
50
43
  "by-law"
51
44
  end
@@ -2,11 +2,28 @@ require 'forwardable'
2
2
 
3
3
  module Slaw
4
4
  # A collection of Act instances.
5
+ #
6
+ # This is useful for looking up acts by their FRBR uri and for
7
+ # loading a collection of XML act documents.
8
+ #
9
+ # This collection is enumerable and can be iterated over. Use {#items} to
10
+ # access the underlying array of objects.
11
+ #
12
+ # @example Load a collection of acts and then iterate over them.
13
+ #
14
+ # acts = Slaw::DocumentCollection.new
15
+ # acts.discover('/path/to/acts/')
16
+ #
17
+ # for act in acts
18
+ # puts act.short_name
19
+ # end
20
+ #
5
21
  class DocumentCollection
6
22
 
7
23
  include Enumerable
8
24
  extend Forwardable
9
25
 
26
+ # [Array<Act>] The underlying array of acts
10
27
  attr_accessor :items
11
28
 
12
29
  def_delegators :items, :each, :<<, :length
@@ -15,16 +32,27 @@ module Slaw
15
32
  @items = items || []
16
33
  end
17
34
 
18
- # Find all XML files in +path+ and return
19
- # a list of instances of +cls+.
35
+ # Find all XML files in `path` and add them into this
36
+ # collection.
37
+ #
38
+ # @param path [String] the path to glob for xml files
39
+ # @param cls [Class] the class to instantiate for each file
40
+ #
41
+ # @return [DocumentCollection] this collection
20
42
  def discover(path, cls=Slaw::Act)
21
43
  for fname in Dir.glob("#{path}/**/*.xml")
22
44
  @items << cls.new(fname)
23
45
  end
46
+
47
+ self
24
48
  end
25
49
 
26
50
  # Try to find an act whose FRBRuri matches this one,
27
51
  # returning nil on failure
52
+ #
53
+ # @param uri [String] the uri to look for
54
+ #
55
+ # @return [Act, nil] the act, or nil
28
56
  def for_uri(uri)
29
57
  return @items.find { |doc| doc.id_uri == uri }
30
58
  end
@@ -0,0 +1,93 @@
1
+ require 'open3'
2
+
3
+ module Slaw
4
+ module Extract
5
+
6
+ # Routines for extracting and cleaning up content from other formats, such as PDF.
7
+ #
8
+ # You may need to set the location of the `pdftotext` binary.
9
+ #
10
+ # On Mac OS X, use `brew install xpdf` or download from http://www.foolabs.com/xpdf/download.html
11
+ #
12
+ # On Heroku, you'll need to do some hoop jumping, see http://theprogrammingbutler.com/blog/archives/2011/07/28/running-pdftotext-on-heroku/
13
+ class Extractor
14
+ include Slaw::Logging
15
+
16
+ @@pdftotext_path = "pdftotext"
17
+
18
+ # Object with text cleaning helpers
19
+ attr_accessor :cleanser
20
+
21
+ def initialize
22
+ @cleanser = Slaw::Parse::Cleanser.new
23
+ end
24
+
25
+ # Extract text from a file and run cleanup on it.
26
+ #
27
+ # @param filename [String] filename to extract from
28
+ #
29
+ # @return [String] extracted text
30
+ def extract_from_file(filename)
31
+ ext = filename[-4..-1].downcase
32
+
33
+ case ext
34
+ when '.pdf'
35
+ extract_from_pdf(filename)
36
+ when '.txt'
37
+ extract_from_text(filename)
38
+ else
39
+ raise ArgumentError.new("Unsupported file type #{ext}")
40
+ end
41
+ end
42
+
43
+ # Extract text from a PDF
44
+ #
45
+ # @param filename [String] filename to extract from
46
+ #
47
+ # @return [String] extracted text
48
+ def extract_from_pdf(filename)
49
+ cmd = pdf_to_text_cmd(filename)
50
+ logger.info("Executing: #{cmd}")
51
+ stdout, status = Open3.capture2(*cmd)
52
+
53
+ if status == 0
54
+ cleanup(stdout)
55
+ else
56
+ nil
57
+ end
58
+ end
59
+
60
+ # Build a command for the external PDF-to-text utility.
61
+ #
62
+ # @param filename [String] the pdf file
63
+ #
64
+ # @return [Array<String>] command and params to execute
65
+ def pdf_to_text_cmd(filename)
66
+ [Extractor.pdftotext_path, "-enc", "UTF-8", filename, "-"]
67
+ end
68
+
69
+ def extract_from_text(filename)
70
+ cleanup(File.read(filename))
71
+ end
72
+
73
+ # Run general once-off cleanup of extracted text.
74
+ def cleanup(text)
75
+ text = @cleanser.cleanup(text)
76
+ text = @cleanser.remove_empty_lines(text)
77
+ text = @cleanser.reformat(text)
78
+
79
+ text
80
+ end
81
+
82
+ # Get location of the pdftotext executable for all instances.
83
+ def self.pdftotext_path
84
+ @@pdftotext_path
85
+ end
86
+
87
+ # Set location of the pdftotext executable for all instances.
88
+ def self.pdftotext_path=(val)
89
+ @@pdftotext_path = val
90
+ end
91
+ end
92
+ end
93
+ end
@@ -3,28 +3,36 @@ module Slaw
3
3
  module Blocklists
4
4
  include Slaw::Namespace
5
5
 
6
- # Correctly re-nest nested block lists.
6
+ # Correctly re-nest nested block lists. We do this by identifying the
7
+ # numbering format of each item in the list and comparing it with the
8
+ # surrounding elements. When the numbering format changes, we start
9
+ # a new nested list.
7
10
  #
8
- # (a)
9
- # (b)
10
- # (i)
11
- # (ii)
12
- # (aa)
13
- # (bb)
14
- # (c)
15
- # (d)
11
+ # We make sure to handle special cases such as `(i)` coming between
12
+ # `(h)` and `(j)` versus being at the start of a `(i), (ii), (iii)`
13
+ # list.
16
14
  #
17
- # becomes
18
- #
19
- # (a)
20
- # (b)
21
- # (i)
22
- # (ii)
15
+ # (a)
16
+ # (b)
17
+ # (i)
18
+ # (ii)
23
19
  # (aa)
24
20
  # (bb)
25
- # (c)
26
- # (d)
21
+ # (c)
22
+ # (d)
23
+ #
24
+ # becomes
25
+ #
26
+ # (a)
27
+ # (b)
28
+ # (i)
29
+ # (ii)
30
+ # (aa)
31
+ # (bb)
32
+ # (c)
33
+ # (d)
27
34
  #
35
+ # @param doc [Nokogiri::XML::Document] the document
28
36
  def self.nest_blocklists(doc)
29
37
  doc.xpath('//a:blockList', a: NS).each do |blocklist|
30
38
  items = blocklist.xpath('a:item', a: NS)
@@ -1,25 +1,67 @@
1
+ # encoding: UTF-8
2
+
1
3
  require 'builder'
2
4
  require 'treetop'
3
5
 
4
6
  module Slaw
5
7
  module Parse
6
- # Primary class for building Akoma Ntoso documents.
8
+ # The primary class for building Akoma Ntoso documents from plain text documents.
9
+ #
10
+ # The builder uses a grammar to break down a plain-text version of an act into a
11
+ # syntax tree. This tree can then be serialized into an Akoma Ntoso compatible
12
+ # XML document.
13
+ #
14
+ # @example Parse some text into a well-formed document
15
+ # builder = Slaw::Parse::Builder.new
16
+ # xml = builder.parse_text(text)
17
+ # doc = builder.parse_xml(xml)
18
+ # builder.postprocess(doc)
19
+ #
20
+ # @example A quicker way to build a well-formed document
21
+ # builder = Slaw::Parse::Builder.new
22
+ # doc = builder.parse_and_process_text(text)
7
23
  #
8
- # It can convert from plain text a new Akoma Ntoso document, or
9
- # update existing documents.
10
24
  class Builder
11
25
  include Slaw::Namespace
12
26
  include Slaw::Logging
13
27
 
14
28
  Treetop.load(File.dirname(__FILE__) + "/bylaw.treetop")
15
29
 
30
+ # [Hash] A Hash of options that are made available to the parser when parsing.
16
31
  attr_accessor :parse_options
17
32
 
18
- def initialize()
19
- @parse_options = {}
33
+ def initialize(parse_options={})
34
+ @parse_options = parse_options
35
+ end
36
+
37
+ # Do all the work necessary to parse text into a well-formed XML document.
38
+ #
39
+ # @param text [String] the text to parse
40
+ # @param root [Symbol] the root element of the grammar
41
+ #
42
+ # @return [Nokogiri::XML::Document] a well formed document
43
+ def parse_and_process_text(text, root=:bylaw)
44
+ postprocess(parse_xml(parse_text(text, root)))
45
+ end
46
+
47
+ # Parse text into XML. You should still run {#postprocess} on the
48
+ # resulting XML to normalise it.
49
+ #
50
+ # @param text [String] the text to parse
51
+ # @param root [Symbol] the root element of the grammar
52
+ #
53
+ # @return [String] an XML string
54
+ def parse_text(text, root=:bylaw)
55
+ tree = text_to_syntax_tree(text, root)
56
+ xml_from_syntax_tree(tree)
20
57
  end
21
58
 
22
- # Try to parse plain text into a syntax tree
59
+ # Parse plain text into a syntax tree.
60
+ #
61
+ # @param text [String] the text to parse
62
+ # @param root [Symbol] the root element of the grammar
63
+ #
64
+ # @return [Object] the root of the resulting parse tree, usually a Treetop::Node object
23
65
  def text_to_syntax_tree(text, root=:bylaw)
24
66
  parser = Slaw::Parse::BylawParser.new
25
67
  parser.options = @parse_options
@@ -35,7 +77,12 @@ module Slaw
35
77
  tree
36
78
  end
37
79
 
38
- # Generate an XML document from the given syntax tree.
80
+ # Generate an XML document from the given syntax tree. You should still
81
+ # run {#postprocess} on the resulting XML to normalise it.
82
+ #
83
+ # @param tree [Object] a Treetop::Node object
84
+ #
85
+ # @return [String] an XML string
39
86
  def xml_from_syntax_tree(tree)
40
87
  s = ""
41
88
  builder = ::Builder::XmlMarkup.new(indent: 2, target: s)
@@ -50,38 +97,41 @@ module Slaw
50
97
  s
51
98
  end
52
99
 
100
+ # Parse a string into a Nokogiri::XML::Document
101
+ #
102
+ # @param xml [String] string to parse
103
+ #
104
+ # @return [Nokogiri::XML::Document]
53
105
  def parse_xml(xml)
54
106
  Nokogiri::XML(xml, &:noblanks)
55
107
  end
56
108
 
109
+ # Serialise a Nokogiri::XML::Document into a string
110
+ #
111
+ # @param doc [Nokogiri::XML::Document] document
112
+ #
113
+ # @return [String] pretty printed string
57
114
  def to_xml(doc)
58
115
  doc.to_xml(indent: 2)
59
116
  end
60
117
 
61
- # Run various postprocesses on the XML, and return
62
- # the updated XML.
118
+ # Postprocess an XML document.
119
+ #
120
+ # @param doc [Nokogiri::XML::Document]
121
+ #
122
+ # @return [Nokogiri::XML::Document] the updated document
63
123
  def postprocess(doc)
64
124
  normalise_headings(doc)
65
125
  find_short_title(doc)
66
- sanitise(doc)
67
- end
68
-
69
- # Do sanitisations, such as finding and linking definitions
70
- def sanitise(doc)
71
126
  link_definitions(doc)
72
127
  nest_blocklists(doc)
73
- end
74
128
 
75
- # recalculate ids for <term> elements
76
- def renumber_terms(doc)
77
- logger.info("Renumbering terms")
78
-
79
- doc.xpath('//a:term', a: NS).each_with_index do |term, i|
80
- term['id'] = "trm#{i}"
81
- end
129
+ doc
82
130
  end
83
131
 
84
132
  # Change CAPCASE headings into Sentence case.
133
+ #
134
+ # @param doc [Nokogiri::XML::Document]
85
135
  def normalise_headings(doc)
86
136
  logger.info("Normalising headings")
87
137
 
@@ -94,6 +144,8 @@ module Slaw
94
144
  end
95
145
 
96
146
  # Find the short title and add it as an FRBRalias element in the meta section
147
+ #
148
+ # @param doc [Nokogiri::XML::Document]
97
149
  def find_short_title(doc)
98
150
  logger.info("Finding short title")
99
151
 
@@ -117,6 +169,8 @@ module Slaw
117
169
 
118
170
  # Find definitions of terms and introduce them into the
119
171
  # meta section of the document.
172
+ #
173
+ # @param doc [Nokogiri::XML::Document]
120
174
  def link_definitions(doc)
121
175
  logger.info("Finding and linking definitions")
122
176
 
@@ -126,6 +180,12 @@ module Slaw
126
180
  renumber_terms(doc)
127
181
  end
128
182
 
183
+ # Find `def` elements in the document and return a Hash from
184
+ # term ids to the text of each term
185
+ #
186
+ # @param doc [Nokogiri::XML::Document]
187
+ #
188
+ # @return [Hash{String => String}]
129
189
  def find_definitions(doc)
130
190
  guess_at_definitions(doc)
131
191
 
@@ -239,6 +299,21 @@ module Slaw
239
299
  end
240
300
  end
241
301
 
302
+ # recalculate ids for <term> elements
303
+ def renumber_terms(doc)
304
+ logger.info("Renumbering terms")
305
+
306
+ doc.xpath('//a:term', a: NS).each_with_index do |term, i|
307
+ term['id'] = "trm#{i}"
308
+ end
309
+ end
310
+
311
+ # Correctly nest blocklists.
312
+ #
313
+ # The grammar gives us flat blocklists, we need to introspect the
314
+ # numbering of the lists to correctly nest them.
315
+ #
316
+ # @param doc [Nokogiri::XML::Document]
242
317
  def nest_blocklists(doc)
243
318
  logger.info("Nesting blocklists")
244
319
 
@@ -1,3 +1,5 @@
1
+ # encoding: UTF-8
2
+
1
3
  require 'slaw/parse/grammar_helpers'
2
4
 
3
5
  module Slaw
@@ -50,16 +50,17 @@ module Slaw
50
50
  .gsub(" ", '')
51
51
  end
52
52
 
53
+ # change weird quotes to normal ones
53
54
  def fix_quotes(s)
54
- # change weird quotes to normal ones
55
55
  s.gsub(/‘‘|’’|''/, '"')
56
56
  end
57
57
 
58
+ # tabs to spaces
58
59
  def expand_tabs(s)
59
- # tabs to spaces
60
60
  s.gsub(/\t/, ' ')
61
61
  end
62
62
 
63
+ # Try to remove boilerplate lines found in many files, such as page numbers.
63
64
  def remove_boilerplate(s)
64
65
  # nuke any line to do with Sabinet and the government printer
65
66
  s.gsub(/^.*Sabinet.*Government Printer.*$/i, '')\
@@ -72,6 +73,8 @@ module Slaw
72
73
  .gsub(/^\s*page \d+( of \d+)?\s*\n/i, '')
73
74
  end
74
75
 
76
+ # Get rid of whitespace at the end of lines and at the start and end of the
77
+ # entire string.
75
78
  def chomp(s)
76
79
  # trailing whitespace at end of lines
77
80
  s = s.gsub(/ +$/, '')
@@ -85,8 +88,11 @@ module Slaw
85
88
  s.end_with?("\n") ? s : (s + "\n")
86
89
  end
87
90
 
88
- # make educated guesses about lines that should
89
- # have been broken but haven't, and break them
91
+ # Make educated guesses about lines that should
92
+ # have been broken but haven't, and break them.
93
+ #
94
+ # This is very dependent on a locale's legislation grammar, there are
95
+ # lots of rules of thumb that make this work.
90
96
  def break_lines(s)
91
97
  # often we find a section title munged onto the same line as its first statement
92
98
  # eg:
@@ -115,8 +121,8 @@ module Slaw
115
121
  s
116
122
  end
117
123
 
118
- # finds likely candidates for unnecessarily broken lines
119
- # and them
124
+ # Find likely candidates for unnecessarily broken lines
125
+ # and unbreak them.
120
126
  def unbreak_lines(s)
121
127
  lines = s.split(/\n/)
122
128
  output = []
@@ -141,8 +147,8 @@ module Slaw
141
147
  output.join("\n")
142
148
  end
143
149
 
144
- # do our best to remove table of contents at the start,
145
- # it really confuses the grammer
150
+ # Do our best to remove table of contents at the start,
151
+ # it really confuses the grammar.
146
152
  def strip_toc(s)
147
153
  # first, try to find 'TABLE OF CONTENTS' anywhere within the first 4K of text,
148
154
  if toc_start = s[0..4096].match(/TABLE OF CONTENTS/i)
@@ -1,5 +1,9 @@
1
1
  module Slaw
2
2
  module Parse
3
+ # These helpers are mixed into the treetop grammar and provide a means for
4
+ # exposing options into the grammar.
5
+ #
6
+ # @see Builder#parse_options
3
7
  module GrammarHelpers
4
8
  attr_writer :options
5
9
 
@@ -2,7 +2,14 @@ module Slaw
2
2
  module Render
3
3
 
4
4
  # Support for transforming XML AN documents into HTML.
5
+ #
6
+ # This rendering is done using XSLT stylesheets. Both an entire
7
+ # document and fragments can be rendered.
5
8
  class HTMLRenderer
9
+
10
+ # [Hash] A Hash of Nokogiri::XSLT objects
11
+ attr_accessor :xslt
12
+
6
13
  def initialize
7
14
  here = File.dirname(__FILE__)
8
15
 
@@ -12,12 +19,17 @@ module Slaw
12
19
  }
13
20
  end
14
21
 
15
- # Transform an entire XML document +doc+ (a Nokogiri::XML::Document object) into HTML.
16
- # Specify +base_url+ to manage the base for relative URLs generated by
22
+ # Transform an entire XML document (a Nokogiri::XML::Document object) into HTML.
23
+ # Specify `base_url` to manage the base for relative URLs generated by
17
24
  # the transform.
25
+ #
26
+ # @param doc [Nokogiri::XML::Document] document to render
27
+ # @param base_url [String] root URL for relative URLs (cannot be empty)
28
+ #
29
+ # @return [String]
18
30
  def render(doc, base_url='')
19
- params = transform_params({'base_url' => base_url})
20
- run_xslt(:act, doc, params)
31
+ params = _transform_params({'base_url' => base_url})
32
+ _run_xslt(:act, doc, params)
21
33
  end
22
34
 
23
35
  # Transform just a single node and its children into HTML.
@@ -25,8 +37,13 @@ module Slaw
25
37
  # If +node+ has an id, we use xpath to tell the XSLT which
26
38
  # element to transform. Otherwise we copy the node into a new
27
39
  # tree and apply the XSLT to that.
40
+ #
41
+ # @param node [Nokogiri::XML::Node] node to render
42
+ # @param base_url [String] root URL for relative URLs (cannot be empty)
43
+ #
44
+ # @return [String]
28
45
  def render_node(node, base_url='')
29
- params = transform_params({'base_url' => base_url})
46
+ params = _transform_params({'base_url' => base_url})
30
47
 
31
48
  if node.id
32
49
  params += ['root_elem', "//*[@id='#{node.id}']"]
@@ -38,14 +55,14 @@ module Slaw
38
55
  params += ['root_elem', '*']
39
56
  end
40
57
 
41
- run_xslt(:fragment, doc, params)
58
+ _run_xslt(:fragment, doc, params)
42
59
  end
43
60
 
44
- def run_xslt(xslt, doc, params)
61
+ def _run_xslt(xslt, doc, params)
45
62
  @xslt[xslt].transform(doc, params).to_s
46
63
  end
47
64
 
48
- def transform_params(params)
65
+ def _transform_params(params)
49
66
  Nokogiri::XSLT.quote_params(params)
50
67
  end
51
68
  end
@@ -77,7 +77,7 @@
77
77
  <xsl:value-of select="@refersTo" />
78
78
  </xsl:attribute>
79
79
 
80
- <xsl:attribute name="href"><xsl:value-of select="$base_url" />definitions/#def-<xsl:value-of select="translate(@refersTo, '#', '')" /></xsl:attribute>
80
+ <xsl:attribute name="href"><xsl:value-of select="$base_url" />/definitions/#def-<xsl:value-of select="translate(@refersTo, '#', '')" /></xsl:attribute>
81
81
 
82
82
  <xsl:apply-templates />
83
83
  </a>
data/lib/slaw/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Slaw
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
data/slaw.gemspec CHANGED
@@ -23,7 +23,6 @@ Gem::Specification.new do |spec|
23
23
  spec.add_development_dependency "rspec", "~> 2.14.1"
24
24
 
25
25
  spec.add_runtime_dependency "nokogiri", "~> 1.6.0"
26
- spec.add_runtime_dependency "elasticsearch", "~> 1.0.5"
27
26
  spec.add_runtime_dependency "treetop", "~> 1.5"
28
27
  spec.add_runtime_dependency "builder", "~> 3.2.2"
29
28
  spec.add_runtime_dependency "log4r", "~> 1.1.10"
@@ -0,0 +1,14 @@
1
+ require 'tempfile'
2
+
3
+ require 'spec_helper'
4
+ require 'slaw'
5
+
6
+ describe Slaw::Extract::Extractor do
7
+ it 'should extract from plain text' do
8
+ f = Tempfile.new(['test', '.txt'])
9
+ f.write('This is some text')
10
+ f.rewind
11
+
12
+ subject.extract_from_file(f.path).should == "This is some text\n"
13
+ end
14
+ end
@@ -1,3 +1,5 @@
1
+ # encoding: UTF-8
2
+
1
3
  require 'spec_helper'
2
4
  require 'slaw'
3
5
 
@@ -1,3 +1,5 @@
1
+ # encoding: UTF-8
2
+
1
3
  require 'slaw'
2
4
  require 'builder'
3
5
 
@@ -1,3 +1,5 @@
1
+ # encoding: UTF-8
2
+
1
3
  require 'spec_helper'
2
4
 
3
5
  require 'slaw'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: slaw
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Greg Kempe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-09-17 00:00:00.000000000 Z
11
+ date: 2014-09-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -66,20 +66,6 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: 1.6.0
69
- - !ruby/object:Gem::Dependency
70
- name: elasticsearch
71
- requirement: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - "~>"
74
- - !ruby/object:Gem::Version
75
- version: 1.0.5
76
- type: :runtime
77
- prerelease: false
78
- version_requirements: !ruby/object:Gem::Requirement
79
- requirements:
80
- - - "~>"
81
- - !ruby/object:Gem::Version
82
- version: 1.0.5
83
69
  - !ruby/object:Gem::Dependency
84
70
  name: treetop
85
71
  requirement: !ruby/object:Gem::Requirement
@@ -131,6 +117,7 @@ extensions: []
131
117
  extra_rdoc_files: []
132
118
  files:
133
119
  - ".gitignore"
120
+ - ".travis.yml"
134
121
  - Gemfile
135
122
  - LICENSE.txt
136
123
  - README.md
@@ -139,7 +126,7 @@ files:
139
126
  - lib/slaw/act.rb
140
127
  - lib/slaw/bylaw.rb
141
128
  - lib/slaw/collection.rb
142
- - lib/slaw/elasticsearch.rb
129
+ - lib/slaw/extract/extractor.rb
143
130
  - lib/slaw/lifecycle_event.rb
144
131
  - lib/slaw/logging.rb
145
132
  - lib/slaw/namespace.rb
@@ -157,6 +144,7 @@ files:
157
144
  - lib/slaw/version.rb
158
145
  - lib/slaw/xml_support.rb
159
146
  - slaw.gemspec
147
+ - spec/extract/extractor_spec.rb
160
148
  - spec/parse/builder_spec.rb
161
149
  - spec/parse/bylaw_spec.rb
162
150
  - spec/parse/cleanser_spec.rb
@@ -187,6 +175,7 @@ signing_key:
187
175
  specification_version: 4
188
176
  summary: A lightweight library for using Akoma Ntoso acts in Ruby.
189
177
  test_files:
178
+ - spec/extract/extractor_spec.rb
190
179
  - spec/parse/builder_spec.rb
191
180
  - spec/parse/bylaw_spec.rb
192
181
  - spec/parse/cleanser_spec.rb
@@ -1,107 +0,0 @@
1
- require 'elasticsearch'
2
- require 'log4r'
3
-
4
- module Slaw
5
- # Support for indexing and search using elasticsearch
6
- class ElasticSearchSupport
7
- attr_accessor :es, :mapping, :index, :type, :base_url
8
-
9
- def initialize(index, type, base_url, client_params={}, es=nil)
10
- @es = es || create_client(client_params)
11
-
12
- @ix = index
13
- @type = type
14
- @base_url = base_url
15
-
16
- @mapping = {
17
- frbr_uri: {type: 'string', index: 'not_analyzed'},
18
- url: {type: 'string', index: 'not_analyzed'},
19
- title: {type: 'string', analyzer: 'english'},
20
- content: {type: 'string', analyzer: 'english'},
21
- published_on: {type: 'date', format: 'dateOptionalTime'},
22
- region: {type: 'string', index: 'not_analyzed'},
23
- region_name: {type: 'string', index: 'not_analyzed'},
24
- repealed: {type: 'boolean'},
25
- }
26
-
27
- @log = Log4r::Logger['Slaw']
28
- end
29
-
30
- def create_client(client_params)
31
- Elasticsearch::Client.new(client_params)
32
- end
33
-
34
- def reindex!(docs, &block)
35
- define_mapping!
36
- index_documents!(docs, &block)
37
- end
38
-
39
- def index_documents!(docs, &block)
40
- for doc in docs
41
- id = doc.id_uri.gsub('/', '-')
42
-
43
- data = {
44
- frbr_uri: doc.id_uri,
45
- url: @base_url + doc.id_uri,
46
- title: doc.short_title,
47
- content: doc.body.text,
48
- region: doc.region,
49
- published_on: doc.publication['date'],
50
- repealed: doc.repealed?,
51
- }
52
-
53
- yield doc, data if block_given?
54
-
55
- @log.info("Indexing #{id}")
56
- @es.index(index: @ix, type: @type, id: id, body: data)
57
- end
58
- end
59
-
60
- def define_mapping!
61
- @log.info("Deleting index")
62
- @es.indices.create(index: @ix) unless @es.indices.exists(index: @ix)
63
-
64
- # delete existing mapping
65
- unless @es.indices.get_mapping(index: @ix, type: @type).empty?
66
- @es.indices.delete_mapping(index: @ix, type: @type)
67
- end
68
-
69
- @log.info("Defining mappings")
70
- @es.indices.put_mapping(index: @ix, type: @type, body: {
71
- @type => {properties: @mapping}
72
- })
73
- end
74
-
75
- def search(q, from=0, size=10)
76
- @es.search(index: @ix, body: {
77
- query: {
78
- multi_match: {
79
- query: q,
80
- type: 'cross_fields',
81
- fields: ['title', 'content'],
82
- }
83
- },
84
- fields: ['frbr_uri', 'repealed', 'published_on', 'title', 'url', 'region_name'],
85
- highlight: {
86
- order: "score",
87
- fields: {
88
- content: {
89
- fragment_size: 80,
90
- number_of_fragments: 2,
91
- },
92
- title: {
93
- number_of_fragments: 0, # entire field
94
- }
95
- },
96
- pre_tags: ['<mark>'],
97
- post_tags: ['</mark>'],
98
- },
99
- from: from,
100
- size: size,
101
- sort: {
102
- '_score' => {order: 'desc'}
103
- }
104
- })
105
- end
106
- end
107
- end