slaw 0.17.2 → 1.0.0.alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +9 -2
- data/bin/slaw +2 -19
- data/lib/slaw/generator.rb +21 -6
- data/lib/slaw/grammars/core_nodes.rb +17 -0
- data/lib/slaw/grammars/inlines.treetop +45 -0
- data/lib/slaw/grammars/inlines_nodes.rb +58 -0
- data/lib/slaw/grammars/pl/act.treetop +246 -0
- data/lib/slaw/grammars/pl/act_nodes.rb +469 -0
- data/lib/slaw/grammars/schedules.treetop +33 -0
- data/lib/slaw/grammars/schedules_nodes.rb +107 -0
- data/lib/slaw/grammars/tables.treetop +59 -0
- data/lib/slaw/grammars/tables_nodes.rb +74 -0
- data/lib/slaw/grammars/terminals.treetop +84 -0
- data/lib/slaw/grammars/za/act.treetop +222 -0
- data/lib/slaw/grammars/za/act_nodes.rb +307 -0
- data/lib/slaw/{za → grammars/za}/act_text.xsl +0 -0
- data/lib/slaw/parse/builder.rb +6 -202
- data/lib/slaw/version.rb +1 -1
- data/spec/generator_spec.rb +2 -0
- data/spec/parse/builder_spec.rb +0 -48
- data/spec/pl/act_block_spec.rb +449 -0
- data/spec/za/act_block_spec.rb +5 -3
- data/spec/za/act_inline_spec.rb +2 -0
- data/spec/za/act_schedules_spec.rb +2 -0
- data/spec/za/act_table_spec.rb +2 -0
- metadata +19 -7
- data/lib/slaw/za/act.treetop +0 -393
- data/lib/slaw/za/act_nodes.rb +0 -532
@@ -0,0 +1,33 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'slaw/grammars/schedules_nodes'
|
4
|
+
|
5
|
+
module Slaw
|
6
|
+
module Grammars
|
7
|
+
grammar Schedules
  # Grammar fragment for the schedules that trail a document.
  #
  # This grammar references rules it does not define itself (body, space,
  # content, alphanums, newline, eol); they must be supplied by whichever
  # grammar includes this one.

  # Entry point: one or more schedules wrapped in a single container node.
  rule schedules_container
    schedules:schedules <ScheduleContainer>
  end

  # One or more consecutive schedules.
  rule schedules
    children:schedule+ <GroupNode>
  end

  # A schedule is its title line(s) followed by an optional body.
  rule schedule
    schedule_title
    body:body?
    <Schedule>
  end

  rule schedule_title
    # First line: the "schedule" keyword, an optionally double-quoted
    # alphanumeric number, any run of separator characters (space, tab,
    # colon, dot, dash), then an optional title taken from the rest of
    # the line.
    space? schedule_title_prefix space? "\""? num:alphanums? "\""? [ \t:.-]* title:(content)?
    # Optional second line holding the heading.
    heading:(newline space? content)?
    eol
  end

  # "schedule" or "schedules", case-insensitively.
  rule schedule_title_prefix
    'schedule'i 's'i?
  end
end
|
32
|
+
end
|
33
|
+
end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
require 'slaw/grammars/core_nodes'
|
2
|
+
|
3
|
+
module Slaw
|
4
|
+
module Grammars
|
5
|
+
module Schedules
|
6
|
+
# FRBR URIs written into the identification metadata of each schedule
# component document.
# NOTE(review): the fixed year/number look like placeholder values that are
# presumably rewritten later in the pipeline -- confirm.
#
# The strings are frozen so the shared constants cannot be mutated in place
# (WORK_URI and MANIFESTATION_URI alias the same objects).
FRBR_URI = '/za/act/1980/01'.freeze
WORK_URI = FRBR_URI
EXPRESSION_URI = "#{FRBR_URI}/eng@".freeze
MANIFESTATION_URI = EXPRESSION_URI
|
10
|
+
|
11
|
+
class ScheduleContainer < Treetop::Runtime::SyntaxNode
  # Emits a <components> element containing every schedule in order.
  #
  # b - the XML builder to write to.
  #
  # Each schedule is rendered with an empty id prefix and its 1-based
  # position in the list.
  def to_xml(b)
    b.components do |components|
      schedules.children.elements.each.with_index(1) do |schedule, position|
        schedule.to_xml(components, "", position)
      end
    end
  end
end
|
20
|
+
|
21
|
+
class Schedule < Treetop::Runtime::SyntaxNode
  # The schedule number as captured by the title rule, or nil when the
  # title carries no number.
  def num
    n = schedule_title.num.text_value
    (n && !n.empty?) ? n : nil
  end

  # Display name of the schedule: the title text when present, otherwise
  # "Schedule <num>", otherwise plain "Schedule".
  def alias
    title = schedule_title.title.text_value

    if !title.blank?
      title
    elsif num
      "Schedule #{num}"
    else
      "Schedule"
    end
  end

  # Text of the optional heading line, or nil when the title has none.
  def heading
    if schedule_title.heading.respond_to? :content
      schedule_title.heading.content.text_value
    end
  end

  # Writes this schedule as a <component> wrapping a <doc> with FRBR
  # metadata and a main body.
  #
  # b        - the XML builder to write to.
  # idprefix - string prepended to the component name to form element ids.
  # i        - index forwarded unchanged to every child element of the body.
  def to_xml(b, idprefix=nil, i=1)
    component =
      if num
        "schedule#{num}"
      else
        # make a component name from the schedule title; stripping every
        # non-alphanumeric character also removes all whitespace
        self.alias.downcase.strip.gsub(/[^a-z0-9]/i, '')
      end

    id = "#{idprefix}#{component}"

    # NOTE(review): country, language, author and work dates below are
    # hard-coded -- confirm they are meant to be overwritten downstream.
    b.component(id: "component-#{id}") { |b|
      b.doc_(name: component) { |b|
        b.meta { |b|
          b.identification(source: "#slaw") { |b|
            b.FRBRWork { |b|
              b.FRBRthis(value: "#{WORK_URI}/#{component}")
              b.FRBRuri(value: WORK_URI)
              b.FRBRalias(value: self.alias)
              b.FRBRdate(date: '1980-01-01', name: 'Generation')
              b.FRBRauthor(href: '#council')
              b.FRBRcountry(value: 'za')
            }
            b.FRBRExpression { |b|
              b.FRBRthis(value: "#{EXPRESSION_URI}/#{component}")
              b.FRBRuri(value: EXPRESSION_URI)
              b.FRBRdate(date: '1980-01-01', name: 'Generation')
              b.FRBRauthor(href: '#council')
              b.FRBRlanguage(language: 'eng')
            }
            b.FRBRManifestation { |b|
              b.FRBRthis(value: "#{MANIFESTATION_URI}/#{component}")
              b.FRBRuri(value: MANIFESTATION_URI)
              # the manifestation is dated with the day of generation
              b.FRBRdate(date: Time.now.strftime('%Y-%m-%d'), name: 'Generation')
              b.FRBRauthor(href: '#slaw')
            }
          }
        }

        b.mainBody { |b|
          # children get ids nested under this schedule's id
          idprefix = "#{id}."

          # there is no good AKN hierarchy container for schedules, so we
          # just use article because we don't use it anywhere else.
          b.article(id: id) { |b|
            b.heading(heading) if heading
            # NOTE(review): every child receives the schedule's own index i,
            # not its position within the body -- confirm this is intended.
            body.children.elements.each { |e| e.to_xml(b, idprefix, i) } if body.is_a? Body
          }
        }
      }
    }
  end
end
|
99
|
+
|
100
|
+
class ScheduleStatement < Treetop::Runtime::SyntaxNode
  # Writes the statement's clauses inside a <p> element; emits nothing
  # when there are no clauses.
  #
  # b        - the XML builder to write to.
  # idprefix - id prefix handed on to the clauses.
  def to_xml(b, idprefix)
    return unless clauses

    b.p do |para|
      clauses.to_xml(para, idprefix)
    end
  end
end
|
105
|
+
end
|
106
|
+
end
|
107
|
+
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'slaw/grammars/terminals'
|
4
|
+
require 'slaw/grammars/tables_nodes'
|
5
|
+
|
6
|
+
module Slaw
|
7
|
+
module Grammars
|
8
|
+
grammar Tables
  ##########
  # MediaWiki-style tables
  #
  # This grammar doesn't support inline table cells (eg: | col1 || col2 || col3).
  # Instead, the builder preprocesses tables to break inline cells onto their own
  # lines, which we do support.
  #
  # The `clauses` rule used by table_line is not defined here; it must be
  # provided by the grammar that includes this one.

  # A whole table: "{|" on its own line, the body, then "|}".
  rule table
    space? '{|' eol
    table_body
    '|}' eol
    <Table>
  end

  # A flat sequence of row markers and cells; the Table node groups cells
  # into rows.
  rule table_body
    (table_row / table_cell)*
  end

  # Row separator line: "|-".
  rule table_row
    '|-' space? eol
  end

  # A cell starts with "!" or "|", optionally followed by attributes.
  rule table_cell
    # don't match end-of-table
    !'|}'
    [!|] attribs:table_attribs? space?
    # first content line, then multiple lines; a following line belongs to
    # this cell as long as it doesn't itself start with "!" or "|"
    content:(line:table_line (![!|] space? line:table_line)*)
    <TableCell>
  end

  # One line of cell content (possibly empty) and its line ending.
  rule table_line
    clauses:clauses? eol
    <TableLine>
  end

  # One or more attributes terminated by "|", eg: colspan="2" |
  rule table_attribs
    space? attribs:(table_attrib+) '|'
  end

  # A single name=value attribute; the value must be wrapped in matching
  # double or single quotes.
  rule table_attrib
    name:([a-z_-]+) '=' value:(
      ('"' (!'"' .)* '"') /
      ("'" (!"'" .)* "'"))
    space?
  end

  include Terminals
end
|
58
|
+
end
|
59
|
+
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
module Slaw
|
2
|
+
module Grammars
|
3
|
+
module Tables
|
4
|
+
class Table < Treetop::Runtime::SyntaxNode
  # Writes the table as a <table> element with one <tr> per row.
  #
  # b        - the XML builder to write to.
  # idprefix - prefix for the table's id attribute.
  # i        - number appended to the id ("<idprefix>table<i>").
  #
  # The parsed body is a flat list of cells and row markers. Consecutive
  # cells form a row; row markers only separate rows, so empty rows are
  # never emitted.
  def to_xml(b, idprefix, i=0)
    b.table(id: "#{idprefix}table#{i}") do |table|
      rows = table_body.elements
        .chunk { |node| node.is_a?(TableCell) }
        .select { |is_cell, _nodes| is_cell }
        .map { |_is_cell, cells| cells }

      rows.each do |cells|
        table.tr do |tr|
          cells.each { |cell| cell.to_xml(tr, "") }
        end
      end
    end
  end
end
|
33
|
+
|
34
|
+
class TableCell < Treetop::Runtime::SyntaxNode
  # Writes the cell as <th> (when its text starts with "!") or <td>,
  # carrying any parsed attributes, with the content lines inside a <p>.
  #
  # b        - the XML builder to write to.
  # idprefix - accepted for interface compatibility; not used here.
  def to_xml(b, idprefix)
    cell_tag = (text_value[0] == '!') ? :th : :td

    attributes = {}
    unless attribs.empty?
      attribs.attribs.elements.each do |pair|
        # key=value (strip quotes around value)
        attributes[pair.name.text_value.strip] = pair.value.text_value[1..-2]
      end
    end

    b.send(cell_tag, attributes) do |cell|
      cell.p do |para|
        # first line, and the rest
        lines = [content.line, *content.elements.last.elements.map(&:line)]
        final = lines.length - 1

        lines.each_with_index do |line, index|
          line.to_xml(para, index, index == final)
        end
      end
    end
  end
end
|
58
|
+
|
59
|
+
class TableLine < Treetop::Runtime::SyntaxNode
  # Writes one line of table cell content.
  #
  # b    - the XML builder to write to.
  # i    - index of this line within its cell (0 for the first line).
  # tail - true when this is the last line of the cell.
  def to_xml(b, i, tail)
    clauses.to_xml(b) unless clauses.empty?

    # Line breaks are emitted after the content, one per newline consumed,
    # except:
    #  - after the last line (trailing whitespace is eaten), and
    #  - after an empty first line (leading whitespace is eaten).
    return if tail || (i <= 0 && clauses.empty?)

    eol.text_value.count("\n").times { b.eol }
  end
end
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Slaw
|
4
|
+
module Grammars
|
5
|
+
grammar Terminals
  #########
  ## one line of basic content

  rule content
    # anything but a newline, followed by a
    # newline or end of file (without consuming the newline)
    #
    # NOTE(review): eol as defined below requires a literal "\n", so input
    # presumably must always end with a newline -- confirm.
    [^\n]+ &eol
  end

  ##########
  # terminals

  # eg. 2, 2A, 2b
  rule number_letter
    number letter*
  end

  # a letter followed by any mix of letters and digits, eg. a, ii, a1
  rule letter_ordinal
    letter (letter / digit)*
  end

  # three or more dot-separated numbers, eg. 8.2.1
  rule dotted_number_3
    number '.' number ('.' number)+
  end

  # two dot-separated numbers, eg. 8.2
  rule dotted_number_2
    number '.' number
  end

  rule number
    digit+
  end

  rule digit
    [0-9]
  end

  # ASCII letters only
  rule letter
    [a-zA-Z]
  end

  rule alphanums
    [a-zA-Z0-9]+
  end

  # a straight or curly double quote
  rule quotes
    ["“”]
  end

  # any single character that is not a straight or curly double quote
  rule non_quotes
    [^"“”]
  end

  ##########
  # whitespace

  # one or more spaces or tabs (never a newline)
  rule space
    [ \t]+
  end

  # zero or more spaces, tabs or newlines; always succeeds
  rule whitespace
    [ \t\n]*
  end

  # a line holding nothing but optional spaces/tabs
  rule empty_line
    space? newline
  end

  # end of line: the newline plus any empty lines that follow it
  rule eol
    newline
    empty_line*
  end

  rule newline
    "\n"
  end
end
|
83
|
+
end
|
84
|
+
end
|
@@ -0,0 +1,222 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'slaw/parse/grammar_helpers'
|
4
|
+
require 'slaw/grammars/za/act_nodes'
|
5
|
+
|
6
|
+
require 'slaw/grammars/terminals'
|
7
|
+
require 'slaw/grammars/tables'
|
8
|
+
require 'slaw/grammars/schedules'
|
9
|
+
require 'slaw/grammars/inlines'
|
10
|
+
|
11
|
+
module Slaw
|
12
|
+
module Grammars
|
13
|
+
module ZA
|
14
|
+
grammar Act
  include Slaw::Parse::GrammarHelpers

  # Several rules used below (eg. clauses, inline_statement, table,
  # schedule_title, eol) are not defined in this grammar; they come from
  # the grammars included at the bottom.

  ########
  # major containers

  # Top-level rule: optional preface and preamble, a mandatory body, and
  # optional schedules.
  rule act
    empty_line*
    preface:preface?
    preamble:preamble?
    body
    schedules:schedules_container? <Act>
  end

  # Statements before the preamble, optionally introduced by a PREFACE line.
  # NOTE(review): the !'PREAMBLE' lookaheads here are case-sensitive while
  # the preamble rule matches its keyword case-insensitively -- confirm
  # this is intended.
  rule preface
    !'PREAMBLE'
    ('PREFACE'i space? eol)?
    statements:(!'PREAMBLE' pre_body_statement)* <Preface>
  end

  # A PREAMBLE line followed by its statements.
  rule preamble
    'PREAMBLE'i space? eol
    statements:pre_body_statement* <Preamble>
  end

  rule body
    children:(chapter / part / section / subsection / block_paragraphs)+ <Body>
  end

  # Containers nest strictly downwards: a chapter may hold parts, a part
  # may hold sections, a section may hold subsections.

  rule chapter
    heading:chapter_heading
    children:(part / section / subsection / block_paragraphs)*
    <Chapter>
  end

  rule part
    heading:part_heading
    children:(section / subsection / block_paragraphs)*
    <Part>
  end

  rule section
    section_title
    children:(subsection / block_paragraphs)* <Section>
  end

  rule subsection
    space? subsection_prefix space?
    # eg: (2) (a) foo
    first_child:inline_block_element?
    # eg: (2)
    eol?
    children:block_element* <Subsection>
  end

  ##########
  # group elements
  #
  # these are used externally and provide support when parsing just
  # a particular portion of a document

  rule chapters
    children:chapter+ <GroupNode>
  end

  rule parts
    children:part+ <GroupNode>
  end

  rule sections
    children:section+ <GroupNode>
  end

  ##########
  # headings

  # The heading text may follow the prefix on the same line or on the
  # next line.
  rule chapter_heading
    space? chapter_heading_prefix heading:(newline? content)? eol
    <ChapterHeading>
  end

  rule part_heading
    space? part_heading_prefix heading:(newline? content)? eol
    <PartHeading>
  end

  # Exactly one of the two alternatives can match for a given parse: they
  # are guarded by opposite predicates on options[:section_number_after_title].
  rule section_title
    section_title_1 / section_1_title
  end

  rule section_title_1
    &{ |s| options[:section_number_after_title] }
    # Title on its own line, number on the following line:
    #
    #   Section title
    #   1. Section content
    content eol
    section_title_prefix whitespace <SectionTitleType1>
  end

  rule section_1_title
    # Number and title on the same line:
    #
    #   1. Section title
    #   Section content
    #
    # Additionally, the section title is optional.
    !{ |s| options[:section_number_after_title] }
    space? section_title_prefix section_title:section_title_content? eol?
    <SectionTitleType2>
  end

  rule section_title_content
    # don't match subsections, eg.
    #
    #   10. (1) subsection content...
    space !subsection_prefix content eol
  end

  ##########
  # blocks of content inside containers

  rule block_paragraphs
    block_element+ <BlockParagraph>
  end

  rule block_element
    (table / blocklist / naked_statement)
  end

  # Block elements that don't have to appear at the start of a line.
  # ie. we don't need to guard against the start of a chapter, section, etc.
  rule inline_block_element
    (table / blocklist / inline_statement)
  end

  rule blocklist
    blocklist_item+ <Blocklist>
  end

  rule blocklist_item
    # TODO: this whitespace should probably be space, to allow empty blocklist items followed by plain text
    space? blocklist_item_prefix whitespace item_content:(!blocklist_item_prefix clauses:clauses? eol)? eol?
    <BlocklistItem>
  end

  # eg. (a), (ii), (a1) or a dotted number with three or more parts: 8.2.1
  rule blocklist_item_prefix
    ('(' letter_ordinal ')') / dotted_number_3
  end

  ##########
  # statements - single lines of content
  #
  # If a statement starts with a backslash, it's considered to have escaped the subsequent word,
  # and is ignored. This allows escaping of section headings, etc.

  # A line of content that is not the start of a chapter, part, section,
  # schedule or subsection.
  rule naked_statement
    space? !(chapter_heading / part_heading / section_title / schedule_title / subsection_prefix) '\\'? clauses eol
    <NakedStatement>
  end

  # Same as naked_statement, except that a line starting with a
  # subsection prefix is allowed.
  rule pre_body_statement
    space? !(chapter_heading / part_heading / section_title / schedule_title) '\\'? clauses eol
    <NakedStatement>
  end

  ##########
  # prefixes

  # eg. "Part 2 - " (keyword is case-insensitive)
  rule part_heading_prefix
    'part'i space alphanums [ :-]*
  end

  # eg. "Chapter 2 - " (keyword is case-insensitive)
  rule chapter_heading_prefix
    'chapter'i space alphanums [ :-]*
  end

  # eg. "1." or "12A"; the trailing dot is optional
  rule section_title_prefix
    number_letter '.'?
  end

  rule subsection_prefix
    # there are two subsection handling syntaxes:
    #
    #   (1) foo
    #   (2A) foo
    #
    # and
    #
    #   8.2 foo
    #   8.3 bar
    #
    # The second is less common, but this allows us to handle it.
    # Note that it is usually accompanied by a similar list number format:
    #
    #   8.2.1 item 1
    #   8.2.2 item 2
    #
    # which aren't subsections, but lists, so force the space at the end
    # of the number to catch this case.
    num:('(' number_letter ')')
    /
    num:dotted_number_2 '.'? space
  end

  include Slaw::Grammars::Inlines
  include Slaw::Grammars::Tables
  include Slaw::Grammars::Schedules
  include Slaw::Grammars::Terminals
end
|
220
|
+
end
|
221
|
+
end
|
222
|
+
end
|