slaw 0.17.2 → 1.0.0.alpha.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 60e83b6293761721e7a2213e9e5c668accc5873c
4
- data.tar.gz: b8e31cc3f17512a32af5426a86f87c68c14a0b45
3
+ metadata.gz: e5c6f9929b92711f0a7c608387b49784c8c0198c
4
+ data.tar.gz: 7557e00919085931eae2ce9195bbe55342a50c9c
5
5
  SHA512:
6
- metadata.gz: efad800e07b95ae255fe44ccf118f5dc43793b88394d5eec87b59ddc30ef0222aabbd2e513c7a5dd3c6b34a271e77dccb599f10e2adf8c2faa0f0a83c67f65bf
7
- data.tar.gz: c5c44d83a2c736f62122a96301435517999ba008768f3d54642b828f9897163172da7202691bd32a11a5427613f90a1ae0e5863ae375f6a3bcdf0514616b4fad
6
+ metadata.gz: 386d5e7195f18838c00af784a46792f349913ac5fe07f7799e4e3055fd6720da3e1dcb6b2e10b93d14e2519758eecdd4b46fe31ceecf2bf10b707cf51f6e93dd
7
+ data.tar.gz: 73aa8a5060a8933a09bec8ebac589aeb835de216a76f7865f9bf5883eb89ab347117c648eda8e806eaa668d4f9c1ce7eefeb4bd9dd678abbcb1a200986d98063
data/README.md CHANGED
@@ -8,7 +8,7 @@ Slaw allows you to:
8
8
 
9
9
  1. extract plain text from PDFs and clean up that text
10
10
  2. parse plain text and transform it into an Akoma Ntoso Act XML document
11
- 3. render the XML document into HTML
11
+ 3. unparse Akoma Ntoso XML into text that can be parsed back into Akoma Ntoso.
12
12
 
13
13
  Slaw is lightweight because it wraps around a Nokogiri XML representation of
14
14
  the parsed document. It provides some support methods for manipulating these
@@ -61,7 +61,7 @@ formats.
61
61
 
62
62
  The grammar cannot catch some subtleties of an act or by-law -- such as nested list numbering --
63
63
  so Slaw performs some post-processing on the XML produced by the parser. In particular,
64
- it nests lists correctly and looks for specially defined terms and their occurrences in the document.
64
+ it nests lists correctly.
65
65
 
66
66
  ## Quick Start
67
67
 
@@ -218,6 +218,13 @@ Akoma Ntoso `component` elements at the end of the XML document, with a name of
218
218
 
219
219
  ## Changelog
220
220
 
221
+ ### 1.0.0
222
+
223
+ * Improved support for other legal traditions / grammars.
224
+ * Add Polish legal tradition grammar.
225
+ * Slaw no longer does too much introspection of a parsed document, since that can be so tradition-dependent.
226
+ * Remove definition linking; Slaw no longer supports it.
227
+
221
228
  ### 0.17.2
222
229
 
223
230
  * Match defined terms in 'definition' section.
data/bin/slaw CHANGED
@@ -17,19 +17,15 @@ class SlawCLI < Thor
17
17
  desc "parse FILE", "Parse FILE into Akoma Ntoso XML"
18
18
  option :input, enum: ['text', 'pdf'], desc: "Type of input if it can't be determined automatically"
19
19
  option :pdftotext, desc: "Location of the pdftotext binary if not in PATH"
20
- option :definitions, type: :boolean, desc: "Find and link definitions (this can be slow). Default: false"
21
20
  option :fragment, type: :string, desc: "Akoma Ntoso element name that the imported text represents. Support depends on the grammar."
22
21
  option :id_prefix, type: :string, desc: "Prefix to be used when generating ID elements when parsing a fragment."
23
22
  option :section_number_position, enum: ['before-title', 'after-title', 'guess'], desc: "Where do section titles come in relation to the section number? Default: before-title"
24
23
  option :reformat, type: :boolean, desc: "Reformat common formatting issues to make grammar matching better. Default: true for PDF files, false otherwise"
25
24
  option :crop, type: :string, desc: "Crop box for PDF files, as 'left,top,width,height'."
25
+ option :grammar, type: :string, desc: "Grammar name (usually a two-letter country code). Default is za."
26
26
  def parse(name)
27
27
  logging
28
28
 
29
- if options[:fragment] and options[:definitions]
30
- raise Thor::Error.new("--definitions can't be used together with --fragment")
31
- end
32
-
33
29
  Slaw::Extract::Extractor.pdftotext_path = options[:pdftotext] if options[:pdftotext]
34
30
  extractor = Slaw::Extract::Extractor.new
35
31
 
@@ -50,7 +46,7 @@ class SlawCLI < Thor
50
46
  text = extractor.extract_from_file(name)
51
47
  end
52
48
 
53
- generator = Slaw::ActGenerator.new
49
+ generator = Slaw::ActGenerator.new(options[:grammar] || 'za')
54
50
 
55
51
  text = generator.reformat(text) if options[:reformat]
56
52
 
@@ -94,22 +90,9 @@ class SlawCLI < Thor
94
90
  exit 1
95
91
  end
96
92
 
97
- # definitions?
98
- generator.builder.link_definitions(act.doc) if options[:definitions]
99
-
100
93
  puts act.to_xml(indent: 2)
101
94
  end
102
95
 
103
- desc "link-definitions FILE", "Find and link defined terms in FILE"
104
- def link_definitions(name)
105
- builder = Slaw::ActGenerator.new.builder
106
-
107
- doc = File.open(name, 'r') { |f| doc = builder.parse_xml(f.read) }
108
- builder.link_definitions(doc)
109
-
110
- puts builder.to_xml(doc)
111
- end
112
-
113
96
  desc "unparse FILE", "Unparse FILE from Akoma Ntoso XML back into text suitable for re-parsing"
114
97
  def unparse(name)
115
98
  generator = Slaw::ActGenerator.new
@@ -1,8 +1,6 @@
1
1
  module Slaw
2
2
  # Base class for generating Act documents
3
3
  class ActGenerator
4
- Treetop.load(File.dirname(__FILE__) + "/za/act.treetop")
5
-
6
4
  # [Treetop::Runtime::CompiledParser] compiled parser
7
5
  attr_accessor :parser
8
6
 
@@ -12,13 +10,31 @@ module Slaw
12
10
  # The type that will hold the generated document
13
11
  attr_accessor :document_class
14
12
 
15
- def initialize
16
- @parser = Slaw::ZA::ActParser.new
13
+ @@parsers = {}
14
+
15
+ def initialize(grammar)
16
+ @grammar = grammar
17
+
18
+ @parser = build_parser
17
19
  @builder = Slaw::Parse::Builder.new(parser: @parser)
20
+ @parser = @builder.parser
18
21
  @cleanser = Slaw::Parse::Cleanser.new
19
22
  @document_class = Slaw::Act
20
23
  end
21
24
 
25
+ def build_parser
26
+ unless @@parsers[@grammar]
27
+ # load the grammar
28
+ grammar_file = File.dirname(__FILE__) + "/grammars/#{@grammar}/act.treetop"
29
+ Treetop.load(grammar_file)
30
+
31
+ grammar_class = "Slaw::Grammars::#{@grammar.upcase}::ActParser"
32
+ @@parsers[@grammar] = eval(grammar_class)
33
+ end
34
+
35
+ @parser = @@parsers[@grammar].new
36
+ end
37
+
22
38
  # Generate a Slaw::Act instance from plain text.
23
39
  #
24
40
  # @param text [String] plain text
@@ -66,8 +82,7 @@ module Slaw
66
82
  # Transform an Akoma Ntoso XML document back into a plain-text version
67
83
  # suitable for re-parsing back into XML with no loss of structure.
68
84
  def text_from_act(doc)
69
- here = File.dirname(__FILE__)
70
- xslt = Nokogiri::XSLT(File.read(File.join([here, 'za/act_text.xsl'])))
85
+ xslt = Nokogiri::XSLT(File.read(File.join([File.dirname(__FILE__), "grammars/#{@grammar}/act_text.xsl"])))
71
86
  xslt.transform(doc).child.to_xml
72
87
  end
73
88
  end
@@ -0,0 +1,17 @@
1
+ module Slaw
2
+ module Grammars
3
+ class GroupNode < Treetop::Runtime::SyntaxNode
4
+ def to_xml(b, *args)
5
+ children.elements.each { |e| e.to_xml(b, *args) }
6
+ end
7
+ end
8
+
9
+ class Body < Treetop::Runtime::SyntaxNode
10
+ def to_xml(b)
11
+ b.body { |b|
12
+ children.elements.each_with_index { |e, i| e.to_xml(b, '', i) }
13
+ }
14
+ end
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,45 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'slaw/grammars/terminals'
4
+ require 'slaw/grammars/inlines_nodes'
5
+
6
+ module Slaw
7
+ module Grammars
8
+ grammar Inlines
9
+ ##########
10
+ # inline content
11
+
12
+ rule inline_statement
13
+ space? '\\'? clauses eol
14
+ <NakedStatement>
15
+ end
16
+
17
+ # one or more words, allowing inline elements
18
+ rule clauses
19
+ (remark / image / ref / [^\n])+
20
+ <Clauses>
21
+ end
22
+
23
+ rule remark
24
+ '[[' content:(ref / (!']]' .))+ ']]'
25
+ <Remark>
26
+ end
27
+
28
+ rule image
29
+ # images like markdown
30
+ # eg. ![title text](image url)
31
+ #
32
+ # the title text is optional, but the enclosing square brackets aren't
33
+ '![' content:(!'](' [^\n])* '](' href:([^)\n]+) ')'
34
+ <Image>
35
+ end
36
+
37
+ rule ref
38
+ # links like markdown
39
+ # eg. [link text](link url)
40
+ '[' content:(!'](' [^\n])+ '](' href:([^)\n]+) ')'
41
+ <Ref>
42
+ end
43
+ end
44
+ end
45
+ end
@@ -0,0 +1,58 @@
1
+ module Slaw
2
+ module Grammars
3
+ module Inlines
4
+ class NakedStatement < Treetop::Runtime::SyntaxNode
5
+ def to_xml(b, idprefix, i=0)
6
+ b.p { |b| clauses.to_xml(b, idprefix) } if clauses
7
+ end
8
+
9
+ def content
10
+ clauses
11
+ end
12
+ end
13
+
14
+ class Clauses < Treetop::Runtime::SyntaxNode
15
+ def to_xml(b, idprefix=nil)
16
+ for e in elements
17
+ if e.respond_to? :to_xml
18
+ e.to_xml(b, idprefix)
19
+ else
20
+ b << e.text_value
21
+ end
22
+ end
23
+ end
24
+ end
25
+
26
+ class Remark < Treetop::Runtime::SyntaxNode
27
+ def to_xml(b, idprefix)
28
+ b.remark(status: 'editorial') do |b|
29
+ b << '['
30
+ for e in content.elements
31
+ if e.respond_to? :to_xml
32
+ e.to_xml(b, idprefix)
33
+ else
34
+ b << e.text_value
35
+ end
36
+ end
37
+ b << ']'
38
+ end
39
+ end
40
+ end
41
+
42
+ class Image < Treetop::Runtime::SyntaxNode
43
+ def to_xml(b, idprefix)
44
+ attrs = {src: href.text_value}
45
+ attrs[:alt] = content.text_value unless content.text_value.empty?
46
+ b.img(attrs)
47
+ end
48
+ end
49
+
50
+ class Ref < Treetop::Runtime::SyntaxNode
51
+ def to_xml(b, idprefix)
52
+ b.ref(content.text_value, href: href.text_value)
53
+ end
54
+ end
55
+
56
+ end
57
+ end
58
+ end
@@ -0,0 +1,246 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'slaw/parse/grammar_helpers'
4
+ require 'slaw/grammars/pl/act_nodes'
5
+
6
+ require 'slaw/grammars/terminals'
7
+ require 'slaw/grammars/tables'
8
+ require 'slaw/grammars/schedules'
9
+ require 'slaw/grammars/inlines'
10
+
11
+ module Slaw
12
+ module Grammars
13
+ module PL
14
+ grammar Act
15
+ include Slaw::Parse::GrammarHelpers
16
+
17
+ ########
18
+ # major containers
19
+
20
+ rule act
21
+ empty_line*
22
+ preface:preface?
23
+ preamble:preamble?
24
+ body
25
+ schedules:schedules_container? <Act>
26
+ end
27
+
28
+ rule preface
29
+ !'PREAMBLE'
30
+ ('PREFACE'i space? eol)?
31
+ statements:(!'PREAMBLE' pre_body_statement)* <Preface>
32
+ end
33
+
34
+ rule preamble
35
+ 'PREAMBLE'i space? eol
36
+ statements:pre_body_statement* <Preamble>
37
+ end
38
+
39
+ rule body
40
+ children:(division / subdivision / chapter / article / section / paragraph / point / litera / block_paragraphs)+ <Body>
41
+ end
42
+
43
+ rule division
44
+ heading:division_heading
45
+ children:(subdivision / chapter / article / section / paragraph / point / litera / block_paragraphs)*
46
+ <Division>
47
+ end
48
+
49
+ rule subdivision
50
+ heading:subdivision_heading
51
+ children:(chapter / article / section / paragraph / point / litera / block_paragraphs)*
52
+ <Subdivision>
53
+ end
54
+
55
+ rule chapter
56
+ heading:chapter_heading
57
+ children:(article / section / paragraph / point / litera / block_paragraphs)*
58
+ <Chapter>
59
+ end
60
+
61
+ rule article
62
+ # Art. 55. 1. something
63
+ article_prefix whitespace
64
+ intro:block_element? eol?
65
+ children:(section / paragraph / point / litera / block_paragraphs)* <Article>
66
+ end
67
+
68
+ rule section
69
+ # § 55. foo
70
+ section_prefix whitespace
71
+ intro:block_element? eol?
72
+ children:(paragraph / point / litera / block_paragraphs)* <Section>
73
+ end
74
+
75
+ rule paragraph
76
+ # ustęp:
77
+ # 34. ...
78
+ paragraph_prefix space?
79
+ intro:block_element? eol?
80
+ children:(point / litera / block_paragraphs)* <Paragraph>
81
+ end
82
+
83
+ rule point
84
+ # 12) aoeuaoeu
85
+ # 12a) aoeuaoeu
86
+ point_prefix whitespace
87
+ intro:block_element? eol?
88
+ children:(litera / block_paragraphs)* <Point>
89
+ end
90
+
91
+ rule litera
92
+ # a) aoeuaoeu
93
+ litera_prefix whitespace
94
+ intro:block_element? eol?
95
+ children:block_paragraphs* <Litera>
96
+ end
97
+
98
+ ##########
99
+ # group elements
100
+ #
101
+ # these are used externally and provide support when parsing just
102
+ # a particular portion of a document
103
+
104
+ rule divisions
105
+ children:division+ <GroupNode>
106
+ end
107
+
108
+ rule subdivisions
109
+ children:subdivision+ <GroupNode>
110
+ end
111
+
112
+ rule chapters
113
+ children:chapter+ <GroupNode>
114
+ end
115
+
116
+ rule articles
117
+ children:article+ <GroupNode>
118
+ end
119
+
120
+ rule sections
121
+ children:section+ <GroupNode>
122
+ end
123
+
124
+ rule paragraphs
125
+ children:paragraph+ <GroupNode>
126
+ end
127
+
128
+ rule points
129
+ children:point+ <GroupNode>
130
+ end
131
+
132
+ ##########
133
+ # headings
134
+
135
+ rule division_heading
136
+ space? prefix:division_heading_prefix heading:(newline? content)? eol
137
+ <GenericHeading>
138
+ end
139
+
140
+ rule subdivision_heading
141
+ space? prefix:subdivision_heading_prefix heading:(newline? content)? eol
142
+ <GenericHeading>
143
+ end
144
+
145
+ rule chapter_heading
146
+ space? prefix:chapter_heading_prefix heading:(newline? content)? eol
147
+ <GenericHeading>
148
+ end
149
+
150
+ ##########
151
+ # blocks of content inside containers
152
+
153
+ rule block_paragraphs
154
+ block_element+ <BlockParagraph>
155
+ end
156
+
157
+ rule block_element
158
+ # XXX: blocklist
159
+ (table / naked_statement)
160
+ end
161
+
162
+ # Block elements that don't have to appear at the start of a line.
163
+ # ie. we don't need to guard against the start of a chapter, section, etc.
164
+ rule inline_block_element
165
+ # XXX: blocklist
166
+ (table / inline_statement)
167
+ end
168
+
169
+ rule blocklist
170
+ blocklist_item+ <Blocklist>
171
+ end
172
+
173
+ rule blocklist_item
174
+ # TODO: this whitespace should probably be space, to allow empty blocklist items followed by plain text
175
+ space? blocklist_item_prefix whitespace item_content:(!blocklist_item_prefix clauses:clauses? eol)? eol?
176
+ <BlocklistItem>
177
+ end
178
+
179
+ rule blocklist_item_prefix
180
+ ('(' letter_ordinal ')') / dotted_number_3
181
+ end
182
+
183
+ ##########
184
+ # statements - single lines of content
185
+ #
186
+ # If a statement starts with a backslash, the backslash is treated as escaping the subsequent word
187
+ # and is itself dropped from the output. This allows escaping of section headings, etc.
188
+
189
+ rule naked_statement
190
+ space? !(division_heading / subdivision_heading / chapter_heading / article_prefix / section_prefix / schedule_title / paragraph_prefix / point_prefix / litera_prefix) '\\'? clauses eol
191
+ <NakedStatement>
192
+ end
193
+
194
+ rule pre_body_statement
195
+ space? !(division_heading / subdivision_heading / chapter_heading / article_prefix / section_prefix / schedule_title) '\\'? clauses eol
196
+ <NakedStatement>
197
+ end
198
+
199
+ ##########
200
+ # prefixes
201
+
202
+ rule division_heading_prefix
203
+ 'dzia'i ('ł'/'Ł') space alphanums [ :-]*
204
+ end
205
+
206
+ rule subdivision_heading_prefix
207
+ 'oddzia'i ('ł'/'Ł') space alphanums [ :.-]*
208
+ end
209
+
210
+ rule chapter_heading_prefix
211
+ 'rozdzia'i ('ł'/'Ł') space alphanums [ :.-]*
212
+ end
213
+
214
+ rule article_prefix
215
+ ('Art.'i / ('Artyku'i 'ł'/'Ł')) space number_letter '.'?
216
+ end
217
+
218
+ rule section_prefix
219
+ '§' space alphanums '.'?
220
+ end
221
+
222
+ rule paragraph_prefix
223
+ number_letter '.'
224
+ end
225
+
226
+ rule point_prefix
227
+ # 1) foo
228
+ # 2A) foo
229
+ number_letter ')'
230
+ end
231
+
232
+ rule litera_prefix
233
+ # a) foo
234
+ # bb) foo
235
+ letters:letter+ ')'
236
+ end
237
+
238
+ include Slaw::Grammars::Inlines
239
+ include Slaw::Grammars::Tables
240
+ include Slaw::Grammars::Schedules
241
+ include Slaw::Grammars::Terminals
242
+ end
243
+ end
244
+ end
245
+ end
246
+