slaw 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a0da8d0d88cfd753f8ef854937248c8881440390
4
- data.tar.gz: 0762a47f6b0bac65d4b3fe829bc2a4997f3b23f9
3
+ metadata.gz: 680301c5ade280eb7da5ea92c509f491631824f2
4
+ data.tar.gz: f2ddd5a99631121bf3693da5f229e38a6f590142
5
5
  SHA512:
6
- metadata.gz: 4ab788b276cd06d1735bb859a1f7fa08820f9bf65b2c6282df65ea1fd2303cbd5b42433366a3a0b2a7a20dbe227e78cc6b5caa2ab3b5cb988d6c2a27097f05ce
7
- data.tar.gz: 3882e5a3b292dfcd9adecb0b2077f9cb21a5d5e76e90402a09c77f146a1ec3acb0272649daf03cc64556864bfd5d8a921d873cbb20534c626542894a02218372
6
+ metadata.gz: 844130f24fa5e4e7e2acd8bacc9381bbd043591676a4fd22e9f1deec87e99b813f3062e4c4ec7286aca4ec0fe2a17161c39d85f5a07c8819192c82cd6203e474
7
+ data.tar.gz: de11ab3cb747c7341209e79f131506f6e2fc44065a73d95bb936c2b36b348646644024b0b657252106cd4c6d9f1b792ca4f7884e8f45fff4bda453da0a736cb7
data/.travis.yml ADDED
@@ -0,0 +1,5 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.3
4
+ - 2.1.0
5
+ - 2.1.1
data/Gemfile CHANGED
@@ -1,5 +1,4 @@
1
1
  source 'https://rubygems.org'
2
- ruby '2.1.1'
3
2
 
4
3
  # Specify your gem's dependencies in slaw.gemspec
5
4
  gemspec
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # Slaw
1
+ # Slaw [![Build Status](https://travis-ci.org/longhotsummer/slaw.svg)](http://travis-ci.org/longhotsummer/slaw)
2
2
 
3
3
  Slaw is a lightweight library for rendering and generating Akoma Ntoso acts from plain text and PDF documents.
4
4
  It is used to power [openbylaws.org.za](http://openbylaws.org.za).
@@ -21,11 +21,33 @@ Or install it yourself as:
21
21
 
22
22
  TODO: Write usage instructions here
23
23
 
24
+ ### Extracting text from PDFs
25
+
26
+ You will need [xpdf](http://www.foolabs.com/xpdf/) to run PDF extraction. If you're
27
+ on a Mac you can use
28
+
29
+ brew install xpdf
30
+
31
+ Extracting text from PDFs often breaks lines in odd places (or doesn't break them when it should). Slaw gets around
32
+ this by running some cleanup routines on the extracted text.
33
+
34
+ ```ruby
35
+ extractor = Slaw::Extract::Extractor.new
36
+
37
+ # to guess the filetype by extension
38
+ text = extractor.extract_from_file('/path/to/file.pdf')
39
+
40
+ # or if you know it's a PDF
41
+ text = extractor.extract_from_pdf('/path/to/file.pdf')
42
+
43
+ # You can also "extract" text from a plain-text file
44
+ text = extractor.extract_from_text('/path/to/file.txt')
45
+ ```
46
+
24
47
  ## Contributing
25
48
 
26
- 1. Fork it ( http://github.com/longhotsummer/slaw/fork )
49
+ 1. Fork it at http://github.com/longhotsummer/slaw/fork
27
50
  2. Create your feature branch (`git checkout -b my-new-feature`)
28
51
  3. Commit your changes (`git commit -am 'Add some feature'`)
29
52
  4. Push to the branch (`git push origin my-new-feature`)
30
53
  5. Create new Pull Request
31
-
data/Rakefile CHANGED
@@ -5,3 +5,5 @@ begin
5
5
  RSpec::Core::RakeTask.new(:spec)
6
6
  rescue LoadError
7
7
  end
8
+
9
+ task default: [:spec]
data/lib/slaw.rb CHANGED
@@ -18,7 +18,8 @@ require 'slaw/parse/cleanser'
18
18
  require 'slaw/parse/error'
19
19
  require 'slaw/parse/grammar_helpers'
20
20
  require 'slaw/parse/nodes'
21
- require 'slaw/elasticsearch'
21
+
22
+ require 'slaw/extract/extractor'
22
23
 
23
24
  module Slaw
24
25
  end
data/lib/slaw/act.rb CHANGED
@@ -1,5 +1,12 @@
1
1
  module Slaw
2
- # Wraps an AkomaNtoso 2.0 XML document describing an Act.
2
+ # An Act wraps a single {http://www.akomantoso.org/ AkomaNtoso 2.0 XML} act document in the form of a
3
+ # Nokogiri::XML::Document object.
4
+ #
5
+ # The Act object provides quick access to certain sections of the document,
6
+ # such as the metadata and the body, as well as common operations such as
7
+ # identifying whether it has been amended ({#amended?}), repealed
8
+ # ({#repealed?}) or what chapters ({#chapters}), parts ({#parts}) and
9
+ # sections ({#sections}) it contains.
3
10
  class Act
4
11
  include Slaw::Namespace
5
12
 
@@ -7,19 +14,45 @@ module Slaw
7
14
  # Act instance itself
8
15
  @@acts = {}
9
16
 
10
- attr_accessor :doc, :meta, :body, :num, :year, :id_uri
11
- attr_accessor :filename, :mtime
17
+ # [Nokogiri::XML::Document] The underlying {Nokogiri::XML::Document} instance
18
+ attr_accessor :doc
19
+
20
+ # [Nokogiri::XML::Node] The `meta` XML node
21
+ attr_accessor :meta
22
+
23
+ # [Nokogiri::XML::Node] The `body` XML node
24
+ attr_accessor :body
25
+
26
+ # [String] The year this act was published
27
+ attr_accessor :year
28
+
29
+ # [String] The act number in the year this act was published
30
+ attr_accessor :num
31
+
32
+ # [String] The FRBR URI of this act, which uniquely identifies it globally
33
+ attr_accessor :id_uri
34
+
35
+ # [String, nil] The source filename, or nil
36
+ attr_accessor :filename
37
+
38
+ # [Time, nil] The mtime of when the source file was last modified
39
+ attr_accessor :mtime
12
40
 
41
+ # Get the act that wraps the document that owns this XML node
42
+ # @param node [Nokogiri::XML::Node]
43
+ # @return [Act] owning act
13
44
  def self.for_node(node)
14
45
  @@acts[node.document]
15
46
  end
16
47
 
17
- # Create a new instance
48
+ # Create a new instance, loading from `filename` if given.
49
+ # @param filename [String] filename to load XML from
18
50
  def initialize(filename=nil)
19
51
  self.load(filename) if filename
20
52
  end
21
53
 
22
- # Load the XML from +filename+
54
+ # Load the XML in `filename` into this instance
55
+ # @param filename [String] filename
23
56
  def load(filename)
24
57
  @filename = filename
25
58
  @mtime = File::mtime(@filename)
@@ -27,7 +60,8 @@ module Slaw
27
60
  File.open(filename) { |f| parse(f) }
28
61
  end
29
62
 
30
- # Parse the XML contained in the file-like object +io+
63
+ # Parse the XML contained in the file-like object `io`
64
+ # @param io [file-like] io object with XML
31
65
  def parse(io)
32
66
  @doc = Nokogiri::XML(io)
33
67
  @meta = @doc.at_xpath('/a:akomaNtoso/a:act/a:meta', a: NS)
@@ -35,10 +69,11 @@ module Slaw
35
69
 
36
70
  @@acts[@doc] = self
37
71
 
38
- extract_id
72
+ _extract_id
39
73
  end
40
74
 
41
- def extract_id
75
+ # Parse the FRBR Uri into its constituent parts
76
+ def _extract_id
42
77
  @id_uri = @meta.at_xpath('./a:identification/a:FRBRWork/a:FRBRuri', a: NS)['value']
43
78
  empty, @country, type, date, @num = @id_uri.split('/')
44
79
 
@@ -46,48 +81,39 @@ module Slaw
46
81
  @year = date.split('-', 2)[0]
47
82
  end
48
83
 
84
+ # An applicable short title for this act, either from the `FRBRalias` element
85
+ # or based on the act number and year.
86
+ # @return [String]
49
87
  def short_title
50
- unless @short_title
51
- node = @meta.at_xpath('./a:identification/a:FRBRWork/a:FRBRalias', a: NS)
52
- if node
53
- @short_title = node['value']
54
- else
55
- @short_title = "Act #{num} of #{year}"
56
- end
57
- end
58
-
59
- @short_title
60
- end
61
-
62
- def url_path
63
- "/#{@country}/acts/#{@year}/#{@num}/"
64
- end
65
-
66
- def url_file
67
- "act-#{@year}-#{@num}"
88
+ node = @meta.at_xpath('./a:identification/a:FRBRWork/a:FRBRalias', a: NS)
89
+ node ? node['value'] : "Act #{num} of #{year}"
68
90
  end
69
91
 
70
- # Has this act been amended?
92
+ # Has this act been amended? This is determined by testing the `contains`
93
+ # attribute of the `act` root element.
94
+ #
95
+ # @return [Boolean]
71
96
  def amended?
72
97
  @doc.at_xpath('/a:akomaNtoso/a:act', a: NS)['contains'] != 'originalVersion'
73
98
  end
74
99
 
75
- # a list of LifecycleEvent objects for amendment events, in date order
100
+ # Get a list of {Slaw::LifecycleEvent} objects for amendment events, in date order.
101
+ # @return [Array<Slaw::LifecycleEvent>] possibly empty list of lifecycle events
76
102
  def amendment_events
77
103
  @meta.xpath('./a:lifecycle/a:eventRef[@type="amendment"]', a: NS).map do |event|
78
104
  LifecycleEvent.new(event)
79
105
  end.sort_by { |e| e.date }
80
106
  end
81
107
 
82
- # Mark this act as being amended by another act, either +act+
83
- # or the details in +opts+:
84
- #
85
- # :uri: uri of the amending act
86
- # :title: title of the amending act
87
- # :date: date of the amendment
108
+ # Mark this act as being amended by another act, either `act`
109
+ # or the details in `opts`.
88
110
  #
89
111
  # It is assumed that there can be only one amendment event on a particular
90
112
  # date. An existing amendment on this date is overwritten.
113
+ #
114
+ # @option opts [String] :uri uri of the amending act
115
+ # @option opts [String] :title title of the amending act
116
+ # @option opts [String] :date date of the amendment (YYYY-MM-DD)
91
117
  def amended_by!(act, opts={})
92
118
  if act
93
119
  opts[:uri] ||= act.id_uri
@@ -133,27 +159,40 @@ module Slaw
133
159
  end
134
160
 
135
161
  # Does this Act have parts?
162
+ # @return [Boolean]
136
163
  def parts?
137
164
  !parts.empty?
138
165
  end
139
166
 
167
+ # Top-level parts of this act. Parts inside chapters are ignored.
168
+ # @return [Array<Nokogiri::XML::Node>] part nodes
140
169
  def parts
141
170
  @body.xpath('./a:part', a: NS)
142
171
  end
143
172
 
173
+ # Does this Act have chapters?
174
+ # @return [Boolean]
144
175
  def chapters?
145
176
  !chapters.empty?
146
177
  end
147
178
 
179
+ # Top-level chapters of this act. Chapters inside parts are ignored.
180
+ # @return [Array<Nokogiri::XML::Node>] chapter nodes
148
181
  def chapters
149
182
  @body.xpath('./a:chapter', a: NS)
150
183
  end
151
184
 
185
+ # Sections of this act
186
+ # @return [Array<Nokogiri::XML::Node>] section nodes
152
187
  def sections
153
188
  @body.xpath('.//a:section', a: NS)
154
189
  end
155
190
 
156
- # The XML node representing the definitions section
191
+ # The primary definitions section of this act, identified by
192
+ # either an `id` of `definitions` or the first section with a heading
193
+ # of `Definitions`.
194
+ #
195
+ # @return [Nokogiri::XML::Node, nil] definitions node or nil
157
196
  def definitions
158
197
  # try looking for the definition list
159
198
  defn = @body.at_css('#definitions')
@@ -166,14 +205,21 @@ module Slaw
166
205
  nil
167
206
  end
168
207
 
169
- # The XML node representing the schedules document
208
+ # An act can contain schedules, additional (generally free-form) documents
209
+ # that are addendums to the main body. A schedules document must be
210
+ # part of a separate `component` and have a `doc` element with a name attribute
211
+ # of `schedules`.
212
+ #
213
+ # @return [Nokogiri::XML::Node, nil] schedules document node
170
214
  def schedules
171
215
  @doc.at_xpath('/a:akomaNtoso/a:components/a:component/a:doc[@name="schedules"]/a:mainBody', a: NS)
172
216
  end
173
217
 
174
- # Get a map from term ids to +[term, defn]+ pairs,
175
- # where +term+ is the text term NS+defn+ is
176
- # the XML node with the definition in it.
218
+ # Get a map from term ids to `[term, defn]` pairs,
219
+ # where `term` is the plain text term and `defn` is
220
+ # the {Nokogiri::XML::Node} containing the definition.
221
+ #
222
+ # @return [Hash{String => Array(String, Nokogiri::XML::Node)}] map from term ids to `[term, definition]` pairs
177
223
  def term_definitions
178
224
  terms = {}
179
225
 
@@ -191,23 +237,31 @@ module Slaw
191
237
  end
192
238
 
193
239
  # Returns the publication element, if any.
240
+ #
241
+ # @return [Nokogiri::XML::Node, nil]
194
242
  def publication
195
243
  @meta.at_xpath('./a:publication', a: NS)
196
244
  end
197
245
 
198
246
  # Has this by-law been repealed?
247
+ #
248
+ # @return [Boolean]
199
249
  def repealed?
200
250
  !!repealed_on
201
251
  end
202
252
 
203
253
  # The date on which this act was repealed, or nil if never repealed
254
+ #
255
+ # @return [Time, nil] date of repeal or nil
204
256
  def repealed_on
205
257
  repeal_el = repeal
206
258
  repeal_el ? Time.parse(repeal_el['date']) : nil
207
259
  end
208
260
 
209
261
  # The element representing the reference that caused the repeal of this
210
- # act, or nil
262
+ # act, or nil.
263
+ #
264
+ # @return [Nokogiri::XML::Node, nil] element of reference to repealing act, or nil
211
265
  def repealed_by
212
266
  repeal_el = repeal
213
267
  return nil unless repeal_el
@@ -216,7 +270,9 @@ module Slaw
216
270
  @meta.at_xpath("./a:references/a:passiveRef[@id='#{source_id}']", a: NS)
217
271
  end
218
272
 
219
- # The XML element representing the repeal of this act, or nil
273
+ # The XML element representing the event of repeal of this act, or nil
274
+ #
275
+ # @return [Nokogiri::XML::Node, nil]
220
276
  def repeal
221
277
  # <lifecycle source="#this">
222
278
  # <eventRef id="e1" date="2010-07-28" source="#original" type="generation"/>
@@ -226,11 +282,15 @@ module Slaw
226
282
  @meta.at_xpath('./a:lifecycle/a:eventRef[@type="repeal"]', a: NS)
227
283
  end
228
284
 
285
+ # The date at which this particular XML manifestation of this document was generated.
286
+ #
287
+ # @return [String] date, YYYY-MM-DD
229
288
  def manifestation_date
230
289
  node = @meta.at_xpath('./a:identification/a:FRBRManifestation/a:FRBRdate[@name="Generation"]', a: NS)
231
290
  node && node['date']
232
291
  end
233
292
 
293
+ # The underlying nature of this act, usually `act` although subclasses may override this.
234
294
  def nature
235
295
  "act"
236
296
  end
data/lib/slaw/bylaw.rb CHANGED
@@ -1,12 +1,19 @@
1
1
  require 'slaw/act'
2
2
 
3
3
  module Slaw
4
- # Wraps an AkomaNtoso XML document describing an Act classed as a By-Law
4
+ # An extension of {Slaw::Act} which wraps an AkomaNtoso XML document describing a By-Law.
5
+ #
6
+ # There are minor differences between Acts and By-laws, the most notable being that a by-law
7
+ # is not identified by a year and a number, and therefore has a different FRBR uri structure.
5
8
  class ByLaw < Act
6
9
 
7
- attr_accessor :region, :name
10
+ # [String] The region this by-law applies to
11
+ attr_accessor :region
12
+
13
+ # [String] A short file-like name of this by-law, unique within its year and region
14
+ attr_accessor :name
8
15
 
9
- def extract_id
16
+ def _extract_id
10
17
  # /za/by-law/cape-town/2010/public-parks
11
18
 
12
19
  @id_uri = @meta.at_xpath('./a:identification/a:FRBRWork/a:FRBRuri', a: NS)['value']
@@ -22,30 +29,16 @@ module Slaw
22
29
  end
23
30
 
24
31
  def short_title
25
- unless @short_title
26
- node = @meta.at_xpath('./a:identification/a:FRBRWork/a:FRBRalias', a: NS)
27
- if node
28
- @short_title = node['value']
29
- else
30
- @short_title = "(Unknown)"
31
- end
32
-
33
- if amended? and not @short_title.end_with?("as amended")
34
- @short_title = @short_title + " as amended"
35
- end
36
- end
32
+ node = @meta.at_xpath('./a:identification/a:FRBRWork/a:FRBRalias', a: NS)
33
+ short_title = node ? node['value'] : "(Unknown)"
37
34
 
38
- @short_title
39
- end
35
+ if amended? and not short_title.end_with?("as amended")
36
+ short_title = short_title + " as amended"
37
+ end
40
38
 
41
- def url_path
42
- "/#{@country}/by-law/#{@region}/#{@year}/#{@name}/"
39
+ short_title
43
40
  end
44
41
 
45
- def url_file
46
- @name
47
- end
48
-
49
42
  def nature
50
43
  "by-law"
51
44
  end
@@ -2,11 +2,28 @@ require 'forwardable'
2
2
 
3
3
  module Slaw
4
4
  # A collection of Act instances.
5
+ #
6
+ # This is useful for looking up acts by their FRBR uri and for
7
+ # loading a collection of XML act documents.
8
+ #
9
+ # This collection is enumerable and can be iterated over. Use {#items} to
10
+ # access the underlying array of objects.
11
+ #
12
+ # @example Load a collection of acts and then iterate over them.
13
+ #
14
+ # acts = Slaw::DocumentCollection.new
15
+ # acts.discover('/path/to/acts/')
16
+ #
17
+ # for act in acts
18
+ # puts act.short_name
19
+ # end
20
+ #
5
21
  class DocumentCollection
6
22
 
7
23
  include Enumerable
8
24
  extend Forwardable
9
25
 
26
+ # [Array<Act>] The underlying array of acts
10
27
  attr_accessor :items
11
28
 
12
29
  def_delegators :items, :each, :<<, :length
@@ -15,16 +32,27 @@ module Slaw
15
32
  @items = items || []
16
33
  end
17
34
 
18
- # Find all XML files in +path+ and return
19
- # a list of instances of +cls+.
35
+ # Find all XML files in `path` and add them into this
36
+ # collection.
37
+ #
38
+ # @param path [String] the path to glob for xml files
39
+ # @param cls [Class] the class to instantiate for each file
40
+ #
41
+ # @return [DocumentCollection] this collection
20
42
  def discover(path, cls=Slaw::Act)
21
43
  for fname in Dir.glob("#{path}/**/*.xml")
22
44
  @items << cls.new(fname)
23
45
  end
46
+
47
+ self
24
48
  end
25
49
 
26
50
  # Try to find an act whose FRBRuri matches this one,
27
51
  # returning nil on failure
52
+ #
53
+ # @param uri [String] the uri to look for
54
+ #
55
+ # @return [Act, nil] the act, or nil
28
56
  def for_uri(uri)
29
57
  return @items.find { |doc| doc.id_uri == uri }
30
58
  end
@@ -0,0 +1,93 @@
1
+ require 'open3'
2
+
3
+ module Slaw
4
+ module Extract
5
+
6
+ # Routines for extracting and cleaning up content from other formats, such as PDF.
7
+ #
8
+ # You may need to set the location of the `pdftotext` binary.
9
+ #
10
+ # On Mac OS X, use `brew install xpdf` or download from http://www.foolabs.com/xpdf/download.html
11
+ #
12
+ # On Heroku, you'll need to do some hoop jumping, see http://theprogrammingbutler.com/blog/archives/2011/07/28/running-pdftotext-on-heroku/
13
+ class Extractor
14
+ include Slaw::Logging
15
+
16
+ @@pdftotext_path = "pdftotext"
17
+
18
+ # Object with text cleaning helpers
19
+ attr_accessor :cleanser
20
+
21
+ def initialize
22
+ @cleanser = Slaw::Parse::Cleanser.new
23
+ end
24
+
25
+ # Extract text from a file and run cleanup on it.
26
+ #
27
+ # @param filename [String] filename to extract from
28
+ #
29
+ # @return [String] extracted text
30
+ def extract_from_file(filename)
31
+ ext = filename[-4..-1].downcase
32
+
33
+ case ext
34
+ when '.pdf'
35
+ extract_from_pdf(filename)
36
+ when '.txt'
37
+ extract_from_text(filename)
38
+ else
39
+ raise ArgumentError.new("Unsupported file type #{ext}")
40
+ end
41
+ end
42
+
43
+ # Extract text from a PDF
44
+ #
45
+ # @param filename [String] filename to extract from
46
+ #
47
+ # @return [String] extracted text
48
+ def extract_from_pdf(filename)
49
+ cmd = pdf_to_text_cmd(filename)
50
+ logger.info("Executing: #{cmd}")
51
+ stdout, status = Open3.capture2(*cmd)
52
+
53
+ if status == 0
54
+ cleanup(stdout)
55
+ else
56
+ nil
57
+ end
58
+ end
59
+
60
+ # Build a command for the external PDF-to-text utility.
61
+ #
62
+ # @param filename [String] the pdf file
63
+ #
64
+ # @return [Array<String>] command and params to execute
65
+ def pdf_to_text_cmd(filename)
66
+ [Extractor.pdftotext_path, "-enc", "UTF-8", filename, "-"]
67
+ end
68
+
69
+ def extract_from_text(filename)
70
+ cleanup(File.read(filename))
71
+ end
72
+
73
+ # Run general once-off cleanup of extracted text.
74
+ def cleanup(text)
75
+ text = @cleanser.cleanup(text)
76
+ text = @cleanser.remove_empty_lines(text)
77
+ text = @cleanser.reformat(text)
78
+
79
+ text
80
+ end
81
+
82
+ # Get location of the pdftotext executable for all instances.
83
+ def self.pdftotext_path
84
+ @@pdftotext_path
85
+ end
86
+
87
+ # Set location of the pdftotext executable for all instances.
88
+ def self.pdftotext_path=(val)
89
+ @@pdftotext_path = val
90
+ end
91
+ end
92
+ end
93
+ end
@@ -3,28 +3,36 @@ module Slaw
3
3
  module Blocklists
4
4
  include Slaw::Namespace
5
5
 
6
- # Correctly re-nest nested block lists.
6
+ # Correctly re-nest nested block lists. We do this by identifying the
7
+ # numbering format of each item in the list and comparing it with the
8
+ # surrounding elements. When the numbering format changes, we start
9
+ # a new nested list.
7
10
  #
8
- # (a)
9
- # (b)
10
- # (i)
11
- # (ii)
12
- # (aa)
13
- # (bb)
14
- # (c)
15
- # (d)
11
+ # We make sure to handle special cases such as `(i)` coming between
12
+ # `(h)` and `(j)` versus being at the start of a `(i), (ii), (iii)`
13
+ # list.
16
14
  #
17
- # becomes
18
- #
19
- # (a)
20
- # (b)
21
- # (i)
22
- # (ii)
15
+ # (a)
16
+ # (b)
17
+ # (i)
18
+ # (ii)
23
19
  # (aa)
24
20
  # (bb)
25
- # (c)
26
- # (d)
21
+ # (c)
22
+ # (d)
23
+ #
24
+ # becomes
25
+ #
26
+ # (a)
27
+ # (b)
28
+ # (i)
29
+ # (ii)
30
+ # (aa)
31
+ # (bb)
32
+ # (c)
33
+ # (d)
27
34
  #
35
+ # @param doc [Nokogiri::XML::Document] the document
28
36
  def self.nest_blocklists(doc)
29
37
  doc.xpath('//a:blockList', a: NS).each do |blocklist|
30
38
  items = blocklist.xpath('a:item', a: NS)
@@ -1,25 +1,67 @@
1
+ # encoding: UTF-8
2
+
1
3
  require 'builder'
2
4
  require 'treetop'
3
5
 
4
6
  module Slaw
5
7
  module Parse
6
- # Primary class for building Akoma Ntoso documents.
8
+ # The primary class for building Akoma Ntoso documents from plain text documents.
9
+ #
10
+ # The builder uses a grammar to break down a plain-text version of an act into a
11
+ # syntax tree. This tree can then be serialized into an Akoma Ntoso compatible
12
+ # XML document.
13
+ #
14
+ # @example Parse some text into a well-formed document
15
+ # builder = Slaw::Parse::Builder.new
16
+ # xml = builder.parse_text(text)
17
+ # doc = builder.parse_xml(xml)
18
+ # builder.postprocess(doc)
19
+ #
20
+ # @example A quicker way to build a well-formed document
21
+ # builder = Slaw::Parse::Builder.new
22
+ # doc = builder.parse_and_process_text(text)
7
23
  #
8
- # It can convert from plain text a new Akoma Ntoso document, or
9
- # update existing documents.
10
24
  class Builder
11
25
  include Slaw::Namespace
12
26
  include Slaw::Logging
13
27
 
14
28
  Treetop.load(File.dirname(__FILE__) + "/bylaw.treetop")
15
29
 
30
+ # [Hash] A Hash of options that are made available to the parser when parsing.
16
31
  attr_accessor :parse_options
17
32
 
18
- def initialize()
19
- @parse_options = {}
33
+ def initialize(parse_options={})
34
+ @parse_options = parse_options
35
+ end
36
+
37
+ # Do all the work necessary to parse text into a well-formed XML document.
38
+ #
39
+ # @param text [String] the text to parse
40
+ # @param root [Symbol] the root element of the grammar
41
+ #
42
+ # @return [Nokogiri::XML::Document] a well formed document
43
+ def parse_and_process_text(text, root=:bylaw)
44
+ postprocess(parse_xml(parse_text(text, root)))
45
+ end
46
+
47
+ # Parse text into XML. You should still run {#postprocess} on the
48
+ # resulting XML to normalise it.
49
+ #
50
+ # @param text [String] the text to parse
51
+ # @param root [Symbol] the root element of the grammar
52
+ #
53
+ # @return [String] an XML string
54
+ def parse_text(text, root=:bylaw)
55
+ tree = text_to_syntax_tree(text, root)
56
+ xml_from_syntax_tree(tree)
20
57
  end
21
58
 
22
- # Try to parse plain text into a syntax tree
59
+ # Parse plain text into a syntax tree.
60
+ #
61
+ # @param text [String] the text to parse
62
+ # @param root [Symbol] the root element of the grammar
63
+ #
64
+ # @return [Object] the root of the resulting parse tree, usually a Treetop::Node object
23
65
  def text_to_syntax_tree(text, root=:bylaw)
24
66
  parser = Slaw::Parse::BylawParser.new
25
67
  parser.options = @parse_options
@@ -35,7 +77,12 @@ module Slaw
35
77
  tree
36
78
  end
37
79
 
38
- # Generate an XML document from the given syntax tree.
80
+ # Generate an XML document from the given syntax tree. You should still
81
+ # run {#postprocess} on the resulting XML to normalise it.
82
+ #
83
+ # @param tree [Object] a Treetop::Node object
84
+ #
85
+ # @return [String] an XML string
39
86
  def xml_from_syntax_tree(tree)
40
87
  s = ""
41
88
  builder = ::Builder::XmlMarkup.new(indent: 2, target: s)
@@ -50,38 +97,41 @@ module Slaw
50
97
  s
51
98
  end
52
99
 
100
+ # Parse a string into a Nokogiri::XML::Document
101
+ #
102
+ # @param xml [String] string to parse
103
+ #
104
+ # @return [Nokogiri::XML::Document]
53
105
  def parse_xml(xml)
54
106
  Nokogiri::XML(xml, &:noblanks)
55
107
  end
56
108
 
109
+ # Serialise a Nokogiri::XML::Document into a string
110
+ #
111
+ # @param doc [Nokogiri::XML::Document] document
112
+ #
113
+ # @return [String] pretty printed string
57
114
  def to_xml(doc)
58
115
  doc.to_xml(indent: 2)
59
116
  end
60
117
 
61
- # Run various postprocesses on the XML, and return
62
- # the updated XML.
118
+ # Postprocess an XML document.
119
+ #
120
+ # @param doc [Nokogiri::XML::Document]
121
+ #
122
+ # @return [Nokogiri::XML::Document] the updated document
63
123
  def postprocess(doc)
64
124
  normalise_headings(doc)
65
125
  find_short_title(doc)
66
- sanitise(doc)
67
- end
68
-
69
- # Do sanitisations, such as finding and linking definitions
70
- def sanitise(doc)
71
126
  link_definitions(doc)
72
127
  nest_blocklists(doc)
73
- end
74
128
 
75
- # recalculate ids for <term> elements
76
- def renumber_terms(doc)
77
- logger.info("Renumbering terms")
78
-
79
- doc.xpath('//a:term', a: NS).each_with_index do |term, i|
80
- term['id'] = "trm#{i}"
81
- end
129
+ doc
82
130
  end
83
131
 
84
132
  # Change CAPCASE headings into Sentence case.
133
+ #
134
+ # @param doc [Nokogiri::XML::Document]
85
135
  def normalise_headings(doc)
86
136
  logger.info("Normalising headings")
87
137
 
@@ -94,6 +144,8 @@ module Slaw
94
144
  end
95
145
 
96
146
  # Find the short title and add it as an FRBRalias element in the meta section
147
+ #
148
+ # @param doc [Nokogiri::XML::Document]
97
149
  def find_short_title(doc)
98
150
  logger.info("Finding short title")
99
151
 
@@ -117,6 +169,8 @@ module Slaw
117
169
 
118
170
  # Find definitions of terms and introduce them into the
119
171
  # meta section of the document.
172
+ #
173
+ # @param doc [Nokogiri::XML::Document]
120
174
  def link_definitions(doc)
121
175
  logger.info("Finding and linking definitions")
122
176
 
@@ -126,6 +180,12 @@ module Slaw
126
180
  renumber_terms(doc)
127
181
  end
128
182
 
183
+ # Find `def` elements in the document and return a Hash from
184
+ # term ids to the text of each term
185
+ #
186
+ # @param doc [Nokogiri::XML::Document]
187
+ #
188
+ # @return [Hash{String => String}]
129
189
  def find_definitions(doc)
130
190
  guess_at_definitions(doc)
131
191
 
@@ -239,6 +299,21 @@ module Slaw
239
299
  end
240
300
  end
241
301
 
302
+ # recalculate ids for <term> elements
303
+ def renumber_terms(doc)
304
+ logger.info("Renumbering terms")
305
+
306
+ doc.xpath('//a:term', a: NS).each_with_index do |term, i|
307
+ term['id'] = "trm#{i}"
308
+ end
309
+ end
310
+
311
+ # Correctly nest blocklists.
312
+ #
313
+ # The grammar gives us flat blocklists, we need to introspect the
314
+ # numbering of the lists to correctly nest them.
315
+ #
316
+ # @param doc [Nokogiri::XML::Document]
242
317
  def nest_blocklists(doc)
243
318
  logger.info("Nesting blocklists")
244
319
 
@@ -1,3 +1,5 @@
1
+ # encoding: UTF-8
2
+
1
3
  require 'slaw/parse/grammar_helpers'
2
4
 
3
5
  module Slaw
@@ -50,16 +50,17 @@ module Slaw
50
50
  .gsub(" ", '')
51
51
  end
52
52
 
53
+ # change weird quotes to normal ones
53
54
  def fix_quotes(s)
54
- # change weird quotes to normal ones
55
55
  s.gsub(/‘‘|’’|''/, '"')
56
56
  end
57
57
 
58
+ # tabs to spaces
58
59
  def expand_tabs(s)
59
- # tabs to spaces
60
60
  s.gsub(/\t/, ' ')
61
61
  end
62
62
 
63
+ # Try to remove boilerplate lines found in many files, such as page numbers.
63
64
  def remove_boilerplate(s)
64
65
  # nuke any line to do with Sabinet and the government printer
65
66
  s.gsub(/^.*Sabinet.*Government Printer.*$/i, '')\
@@ -72,6 +73,8 @@ module Slaw
72
73
  .gsub(/^\s*page \d+( of \d+)?\s*\n/i, '')
73
74
  end
74
75
 
76
+ # Get rid of whitespace at the end of lines and at the start and end of the
77
+ # entire string.
75
78
  def chomp(s)
76
79
  # trailing whitespace at end of lines
77
80
  s = s.gsub(/ +$/, '')
@@ -85,8 +88,11 @@ module Slaw
85
88
  s.end_with?("\n") ? s : (s + "\n")
86
89
  end
87
90
 
88
- # make educated guesses about lines that should
89
- # have been broken but haven't, and break them
91
+ # Make educated guesses about lines that should
92
+ # have been broken but haven't, and break them.
93
+ #
94
+ # This is very dependent on a locale's legislation grammar, there are
95
+ # lots of rules of thumb that make this work.
90
96
  def break_lines(s)
91
97
  # often we find a section title munged onto the same line as its first statement
92
98
  # eg:
@@ -115,8 +121,8 @@ module Slaw
115
121
  s
116
122
  end
117
123
 
118
- # finds likely candidates for unnecessarily broken lines
119
- # and them
124
+ # Find likely candidates for unnecessarily broken lines
125
+ # and unbreak them.
120
126
  def unbreak_lines(s)
121
127
  lines = s.split(/\n/)
122
128
  output = []
@@ -141,8 +147,8 @@ module Slaw
141
147
  output.join("\n")
142
148
  end
143
149
 
144
- # do our best to remove table of contents at the start,
145
- # it really confuses the grammer
150
+ # Do our best to remove table of contents at the start,
151
+ # it really confuses the grammar.
146
152
  def strip_toc(s)
147
153
  # first, try to find 'TABLE OF CONTENTS' anywhere within the first 4K of text,
148
154
  if toc_start = s[0..4096].match(/TABLE OF CONTENTS/i)
@@ -1,5 +1,9 @@
1
1
  module Slaw
2
2
  module Parse
3
+ # These helpers are mixed into the treetop grammar and provide a means for
4
+ # exposing options into the grammar.
5
+ #
6
+ # @see Builder#parse_options
3
7
  module GrammarHelpers
4
8
  attr_writer :options
5
9
 
@@ -2,7 +2,14 @@ module Slaw
2
2
  module Render
3
3
 
4
4
  # Support for transforming XML AN documents into HTML.
5
+ #
6
+ # This rendering is done using XSLT stylesheets. Both an entire
7
+ # document and fragments can be rendered.
5
8
  class HTMLRenderer
9
+
10
+ # [Hash] A Hash of Nokogiri::XSLT objects
11
+ attr_accessor :xslt
12
+
6
13
  def initialize
7
14
  here = File.dirname(__FILE__)
8
15
 
@@ -12,12 +19,17 @@ module Slaw
12
19
  }
13
20
  end
14
21
 
15
- # Transform an entire XML document +doc+ (a Nokogiri::XML::Document object) into HTML.
16
- # Specify +base_url+ to manage the base for relative URLs generated by
22
+ # Transform an entire XML document (a Nokogiri::XML::Document object) into HTML.
23
+ # Specify `base_url` to manage the base for relative URLs generated by
17
24
  # the transform.
25
+ #
26
+ # @param doc [Nokogiri::XML::Document] document to render
27
+ # @param base_url [String] root URL for relative URLs (cannot be empty)
28
+ #
29
+ # @return [String]
18
30
  def render(doc, base_url='')
19
- params = transform_params({'base_url' => base_url})
20
- run_xslt(:act, doc, params)
31
+ params = _transform_params({'base_url' => base_url})
32
+ _run_xslt(:act, doc, params)
21
33
  end
22
34
 
23
35
  # Transform just a single node and its children into HTML.
@@ -25,8 +37,13 @@ module Slaw
25
37
  # If `node` has an id, we use xpath to tell the XSLT which
26
38
  # element to transform. Otherwise we copy the node into a new
27
39
  # tree and apply the XSLT to that.
40
+ #
41
+ # @param node [Nokogiri::XML::Node] node to render
42
+ # @param base_url [String] root URL for relative URLs (cannot be empty)
43
+ #
44
+ # @return [String]
28
45
  def render_node(node, base_url='')
29
- params = transform_params({'base_url' => base_url})
46
+ params = _transform_params({'base_url' => base_url})
30
47
 
31
48
  if node.id
32
49
  params += ['root_elem', "//*[@id='#{node.id}']"]
@@ -38,14 +55,14 @@ module Slaw
38
55
  params += ['root_elem', '*']
39
56
  end
40
57
 
41
- run_xslt(:fragment, doc, params)
58
+ _run_xslt(:fragment, doc, params)
42
59
  end
43
60
 
44
- def run_xslt(xslt, doc, params)
61
+ def _run_xslt(xslt, doc, params)
45
62
  @xslt[xslt].transform(doc, params).to_s
46
63
  end
47
64
 
48
- def transform_params(params)
65
+ def _transform_params(params)
49
66
  Nokogiri::XSLT.quote_params(params)
50
67
  end
51
68
  end
@@ -77,7 +77,7 @@
77
77
  <xsl:value-of select="@refersTo" />
78
78
  </xsl:attribute>
79
79
 
80
- <xsl:attribute name="href"><xsl:value-of select="$base_url" />definitions/#def-<xsl:value-of select="translate(@refersTo, '#', '')" /></xsl:attribute>
80
+ <xsl:attribute name="href"><xsl:value-of select="$base_url" />/definitions/#def-<xsl:value-of select="translate(@refersTo, '#', '')" /></xsl:attribute>
81
81
 
82
82
  <xsl:apply-templates />
83
83
  </a>
data/lib/slaw/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Slaw
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
data/slaw.gemspec CHANGED
@@ -23,7 +23,6 @@ Gem::Specification.new do |spec|
23
23
  spec.add_development_dependency "rspec", "~> 2.14.1"
24
24
 
25
25
  spec.add_runtime_dependency "nokogiri", "~> 1.6.0"
26
- spec.add_runtime_dependency "elasticsearch", "~> 1.0.5"
27
26
  spec.add_runtime_dependency "treetop", "~> 1.5"
28
27
  spec.add_runtime_dependency "builder", "~> 3.2.2"
29
28
  spec.add_runtime_dependency "log4r", "~> 1.1.10"
@@ -0,0 +1,14 @@
1
+ require 'tempfile'
2
+
3
+ require 'spec_helper'
4
+ require 'slaw'
5
+
6
+ describe Slaw::Extract::Extractor do
7
+ it 'should extract from plain text' do
8
+ f = Tempfile.new(['test', '.txt'])
9
+ f.write('This is some text')
10
+ f.rewind
11
+
12
+ subject.extract_from_file(f.path).should == "This is some text\n"
13
+ end
14
+ end
@@ -1,3 +1,5 @@
1
+ # encoding: UTF-8
2
+
1
3
  require 'spec_helper'
2
4
  require 'slaw'
3
5
 
@@ -1,3 +1,5 @@
1
+ # encoding: UTF-8
2
+
1
3
  require 'slaw'
2
4
  require 'builder'
3
5
 
@@ -1,3 +1,5 @@
1
+ # encoding: UTF-8
2
+
1
3
  require 'spec_helper'
2
4
 
3
5
  require 'slaw'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: slaw
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Greg Kempe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-09-17 00:00:00.000000000 Z
11
+ date: 2014-09-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -66,20 +66,6 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: 1.6.0
69
- - !ruby/object:Gem::Dependency
70
- name: elasticsearch
71
- requirement: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - "~>"
74
- - !ruby/object:Gem::Version
75
- version: 1.0.5
76
- type: :runtime
77
- prerelease: false
78
- version_requirements: !ruby/object:Gem::Requirement
79
- requirements:
80
- - - "~>"
81
- - !ruby/object:Gem::Version
82
- version: 1.0.5
83
69
  - !ruby/object:Gem::Dependency
84
70
  name: treetop
85
71
  requirement: !ruby/object:Gem::Requirement
@@ -131,6 +117,7 @@ extensions: []
131
117
  extra_rdoc_files: []
132
118
  files:
133
119
  - ".gitignore"
120
+ - ".travis.yml"
134
121
  - Gemfile
135
122
  - LICENSE.txt
136
123
  - README.md
@@ -139,7 +126,7 @@ files:
139
126
  - lib/slaw/act.rb
140
127
  - lib/slaw/bylaw.rb
141
128
  - lib/slaw/collection.rb
142
- - lib/slaw/elasticsearch.rb
129
+ - lib/slaw/extract/extractor.rb
143
130
  - lib/slaw/lifecycle_event.rb
144
131
  - lib/slaw/logging.rb
145
132
  - lib/slaw/namespace.rb
@@ -157,6 +144,7 @@ files:
157
144
  - lib/slaw/version.rb
158
145
  - lib/slaw/xml_support.rb
159
146
  - slaw.gemspec
147
+ - spec/extract/extractor_spec.rb
160
148
  - spec/parse/builder_spec.rb
161
149
  - spec/parse/bylaw_spec.rb
162
150
  - spec/parse/cleanser_spec.rb
@@ -187,6 +175,7 @@ signing_key:
187
175
  specification_version: 4
188
176
  summary: A lightweight library for using Akoma Ntoso acts in Ruby.
189
177
  test_files:
178
+ - spec/extract/extractor_spec.rb
190
179
  - spec/parse/builder_spec.rb
191
180
  - spec/parse/bylaw_spec.rb
192
181
  - spec/parse/cleanser_spec.rb
@@ -1,107 +0,0 @@
1
- require 'elasticsearch'
2
- require 'log4r'
3
-
4
- module Slaw
5
- # Support for indexing and search using elasticsearch
6
- class ElasticSearchSupport
7
- attr_accessor :es, :mapping, :index, :type, :base_url
8
-
9
- def initialize(index, type, base_url, client_params={}, es=nil)
10
- @es = es || create_client(client_params)
11
-
12
- @ix = index
13
- @type = type
14
- @base_url = base_url
15
-
16
- @mapping = {
17
- frbr_uri: {type: 'string', index: 'not_analyzed'},
18
- url: {type: 'string', index: 'not_analyzed'},
19
- title: {type: 'string', analyzer: 'english'},
20
- content: {type: 'string', analyzer: 'english'},
21
- published_on: {type: 'date', format: 'dateOptionalTime'},
22
- region: {type: 'string', index: 'not_analyzed'},
23
- region_name: {type: 'string', index: 'not_analyzed'},
24
- repealed: {type: 'boolean'},
25
- }
26
-
27
- @log = Log4r::Logger['Slaw']
28
- end
29
-
30
- def create_client(client_params)
31
- Elasticsearch::Client.new(client_params)
32
- end
33
-
34
- def reindex!(docs, &block)
35
- define_mapping!
36
- index_documents!(docs, &block)
37
- end
38
-
39
- def index_documents!(docs, &block)
40
- for doc in docs
41
- id = doc.id_uri.gsub('/', '-')
42
-
43
- data = {
44
- frbr_uri: doc.id_uri,
45
- url: @base_url + doc.id_uri,
46
- title: doc.short_title,
47
- content: doc.body.text,
48
- region: doc.region,
49
- published_on: doc.publication['date'],
50
- repealed: doc.repealed?,
51
- }
52
-
53
- yield doc, data if block_given?
54
-
55
- @log.info("Indexing #{id}")
56
- @es.index(index: @ix, type: @type, id: id, body: data)
57
- end
58
- end
59
-
60
- def define_mapping!
61
- @log.info("Deleting index")
62
- @es.indices.create(index: @ix) unless @es.indices.exists(index: @ix)
63
-
64
- # delete existing mapping
65
- unless @es.indices.get_mapping(index: @ix, type: @type).empty?
66
- @es.indices.delete_mapping(index: @ix, type: @type)
67
- end
68
-
69
- @log.info("Defining mappings")
70
- @es.indices.put_mapping(index: @ix, type: @type, body: {
71
- @type => {properties: @mapping}
72
- })
73
- end
74
-
75
- def search(q, from=0, size=10)
76
- @es.search(index: @ix, body: {
77
- query: {
78
- multi_match: {
79
- query: q,
80
- type: 'cross_fields',
81
- fields: ['title', 'content'],
82
- }
83
- },
84
- fields: ['frbr_uri', 'repealed', 'published_on', 'title', 'url', 'region_name'],
85
- highlight: {
86
- order: "score",
87
- fields: {
88
- content: {
89
- fragment_size: 80,
90
- number_of_fragments: 2,
91
- },
92
- title: {
93
- number_of_fragments: 0, # entire field
94
- }
95
- },
96
- pre_tags: ['<mark>'],
97
- post_tags: ['</mark>'],
98
- },
99
- from: from,
100
- size: size,
101
- sort: {
102
- '_score' => {order: 'desc'}
103
- }
104
- })
105
- end
106
- end
107
- end