slaw 0.17.2 → 1.0.0.alpha.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +9 -2
- data/bin/slaw +2 -19
- data/lib/slaw/generator.rb +21 -6
- data/lib/slaw/grammars/core_nodes.rb +17 -0
- data/lib/slaw/grammars/inlines.treetop +45 -0
- data/lib/slaw/grammars/inlines_nodes.rb +58 -0
- data/lib/slaw/grammars/pl/act.treetop +246 -0
- data/lib/slaw/grammars/pl/act_nodes.rb +469 -0
- data/lib/slaw/grammars/schedules.treetop +33 -0
- data/lib/slaw/grammars/schedules_nodes.rb +107 -0
- data/lib/slaw/grammars/tables.treetop +59 -0
- data/lib/slaw/grammars/tables_nodes.rb +74 -0
- data/lib/slaw/grammars/terminals.treetop +84 -0
- data/lib/slaw/grammars/za/act.treetop +222 -0
- data/lib/slaw/grammars/za/act_nodes.rb +307 -0
- data/lib/slaw/{za → grammars/za}/act_text.xsl +0 -0
- data/lib/slaw/parse/builder.rb +6 -202
- data/lib/slaw/version.rb +1 -1
- data/spec/generator_spec.rb +2 -0
- data/spec/parse/builder_spec.rb +0 -48
- data/spec/pl/act_block_spec.rb +449 -0
- data/spec/za/act_block_spec.rb +5 -3
- data/spec/za/act_inline_spec.rb +2 -0
- data/spec/za/act_schedules_spec.rb +2 -0
- data/spec/za/act_table_spec.rb +2 -0
- metadata +19 -7
- data/lib/slaw/za/act.treetop +0 -393
- data/lib/slaw/za/act_nodes.rb +0 -532
@@ -0,0 +1,33 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'slaw/grammars/schedules_nodes'
|
4
|
+
|
5
|
+
module Slaw
|
6
|
+
module Grammars
|
7
|
+
grammar Schedules
  # Rules for the schedules of a document.
  #
  # This grammar has no `include` of its own and does not define the
  # space, alphanums, content, newline, eol or body rules it refers to;
  # presumably they are supplied by the grammar that includes this one
  # (TODO confirm against the including grammar).

  # Wrapper around the full list of schedules.
  rule schedules_container
    schedules:schedules <ScheduleContainer>
  end

  # One or more schedules in sequence, exposed as `children`.
  rule schedules
    children:schedule+ <GroupNode>
  end

  # A single schedule: its title followed by an optional body.
  rule schedule
    schedule_title
    body:body?
    <Schedule>
  end

  # The title of a schedule.
  #
  # First line: optional leading space, the schedule prefix, an optional
  # number (`num`, optionally wrapped in double quotes), any run of
  # spaces, tabs, colons, full stops and hyphens, and then optional
  # title text (`title`).
  #
  # An optional second line (`heading`) supplies a heading.
  rule schedule_title
    space? schedule_title_prefix space? "\""? num:alphanums? "\""? [ \t:.-]* title:(content)?
    heading:(newline space? content)?
    eol
  end

  # The word "schedule" with an optional trailing "s"; both parts are
  # matched case-insensitively.
  rule schedule_title_prefix
    'schedule'i 's'i?
  end
end
|
32
|
+
end
|
33
|
+
end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
require 'slaw/grammars/core_nodes'
|
2
|
+
|
3
|
+
module Slaw
|
4
|
+
module Grammars
|
5
|
+
module Schedules
|
6
|
+
# Hard-coded FRBR URI values that Schedule#to_xml writes into the
# identification metadata of every schedule.
# NOTE(review): these look like fixed placeholder values (country, year
# and number are constant) - confirm whether a later processing step
# replaces them with the real document's URIs.
FRBR_URI = '/za/act/1980/01'
# The work URI is the FRBR URI itself.
WORK_URI = FRBR_URI
# The expression URI is the FRBR URI followed by "/eng@".
EXPRESSION_URI = "#{FRBR_URI}/eng@"
# The manifestation URI is the same value as the expression URI.
MANIFESTATION_URI = EXPRESSION_URI
|
10
|
+
|
11
|
+
# Syntax node wrapping the list of schedules matched by the
# schedules_container rule.
class ScheduleContainer < Treetop::Runtime::SyntaxNode
  # Calls b.components and, inside it, renders every schedule in order.
  #
  # b - the XML builder to write to.
  #
  # Each schedule receives an empty id prefix and its 1-based position.
  def to_xml(b)
    b.components do |components|
      schedules.children.elements.each.with_index(1) do |schedule, position|
        schedule.to_xml(components, "", position)
      end
    end
  end
end
|
20
|
+
|
21
|
+
# Syntax node for a single schedule: a title (see the schedule_title
# rule) followed by an optional body.
class Schedule < Treetop::Runtime::SyntaxNode
  # Returns the number captured in the schedule title as a String, or
  # nil when the title carries no number.
  def num
    n = schedule_title.num.text_value
    (n.nil? || n.empty?) ? nil : n
  end

  # Returns the human-readable name of the schedule: the title text when
  # there is some, otherwise "Schedule <num>", otherwise just "Schedule".
  def alias
    title = schedule_title.title.text_value
    # NOTE(review): String#blank? is not core Ruby; this relies on it
    # being defined elsewhere in the application - confirm.
    return title unless title.blank?

    num ? "Schedule #{num}" : "Schedule"
  end

  # Returns the text of the optional second title line, or nil when the
  # title has no heading line.
  def heading
    node = schedule_title.heading
    node.content.text_value if node.respond_to?(:content)
  end

  # Writes this schedule to the builder as a component holding its own
  # doc, with identification metadata and a mainBody.
  #
  # b        - the XML builder to write to.
  # idprefix - String prepended to the ids generated for this schedule.
  # i        - position of this schedule among its siblings; only used
  #            to name the component when nothing usable can be derived
  #            from the schedule's number or title.
  def to_xml(b, idprefix=nil, i=1)
    component = component_name(i)
    id = "#{idprefix}#{component}"

    b.component(id: "component-#{id}") { |b|
      b.doc_(name: component) { |b|
        b.meta { |b|
          b.identification(source: "#slaw") { |b|
            b.FRBRWork { |b|
              b.FRBRthis(value: "#{WORK_URI}/#{component}")
              b.FRBRuri(value: WORK_URI)
              b.FRBRalias(value: self.alias)
              b.FRBRdate(date: '1980-01-01', name: 'Generation')
              b.FRBRauthor(href: '#council')
              b.FRBRcountry(value: 'za')
            }
            b.FRBRExpression { |b|
              b.FRBRthis(value: "#{EXPRESSION_URI}/#{component}")
              b.FRBRuri(value: EXPRESSION_URI)
              b.FRBRdate(date: '1980-01-01', name: 'Generation')
              b.FRBRauthor(href: '#council')
              b.FRBRlanguage(language: 'eng')
            }
            b.FRBRManifestation { |b|
              b.FRBRthis(value: "#{MANIFESTATION_URI}/#{component}")
              b.FRBRuri(value: MANIFESTATION_URI)
              b.FRBRdate(date: Time.now.strftime('%Y-%m-%d'), name: 'Generation')
              b.FRBRauthor(href: '#slaw')
            }
          }
        }

        b.mainBody { |b|
          child_prefix = "#{id}."

          # there is no good AKN hierarchy container for schedules, so we
          # just use article because we don't use it anywhere else.
          b.article(id: id) { |b|
            b.heading(heading) if heading

            if body.is_a? Body
              # Give every child its own position within the body. The
              # previous code passed the schedule's index to each child,
              # so all children received the same value.
              body.children.elements.each_with_index { |child, child_index|
                child.to_xml(b, child_prefix, child_index)
              }
            end
          }
        }
      }
    }
  end

  private

  # Builds the component name used in ids and FRBR URIs: "schedule<num>"
  # for numbered schedules, otherwise the alias reduced to lowercase
  # ASCII letters and digits. When nothing is left after that reduction
  # the position i is used so the name is never empty.
  def component_name(i)
    return "schedule#{num}" if num

    slug = self.alias.downcase.strip.gsub(/[^a-z0-9]/i, '')
    slug.empty? ? "schedule#{i}" : slug
  end
end
|
99
|
+
|
100
|
+
# Syntax node for a statement inside a schedule.
class ScheduleStatement < Treetop::Runtime::SyntaxNode
  # Writes the statement's clauses inside a paragraph created with b.p.
  # Does nothing and returns nil when there are no clauses.
  #
  # b        - the XML builder to write to.
  # idprefix - id prefix handed on to the clauses.
  def to_xml(b, idprefix)
    return unless clauses

    b.p { |para| clauses.to_xml(para, idprefix) }
  end
end
|
105
|
+
end
|
106
|
+
end
|
107
|
+
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'slaw/grammars/terminals'
|
4
|
+
require 'slaw/grammars/tables_nodes'
|
5
|
+
|
6
|
+
module Slaw
|
7
|
+
module Grammars
|
8
|
+
grammar Tables
  ##########
  # MediaWiki-style tables
  #
  # this grammar doesn't support inline table cells (eg: | col1 || col2 || col3)
  # instead, the builder preprocesses tables to break inline cells onto their own
  # lines, which we do support.

  # A whole table: an opening "{|" line, the table body, and a closing
  # "|}" line.
  rule table
    space? '{|' eol
    table_body
    '|}' eol
    <Table>
  end

  # Any number of row markers and cells, in the order they appear.
  rule table_body
    (table_row / table_cell)*
  end

  # A "|-" line; the Table node treats it as the start of a new row.
  rule table_row
    '|-' space? eol
  end

  # A single cell: a "!" or "|" marker, optional attributes, and one or
  # more lines of content. The TableCell node renders "!" cells as th
  # and all other cells as td.
  rule table_cell
    # don't match end-of-table
    !'|}'
    [!|] attribs:table_attribs? space?
    # first content line, then multiple lines
    # (a following line is part of this cell only if it does not start
    # with "!" or "|")
    content:(line:table_line (![!|] space? line:table_line)*)
    <TableCell>
  end

  # One line of cell content: optional clauses and the end of the line.
  rule table_line
    clauses:clauses? eol
    <TableLine>
  end

  # One or more attributes, terminated by a "|".
  rule table_attribs
    space? attribs:(table_attrib+) '|'
  end

  # A single name=value attribute. The name is made of lowercase
  # letters, underscores and hyphens; the value is wrapped in double or
  # single quotes and cannot contain the quote character that wraps it.
  rule table_attrib
    name:([a-z_-]+) '=' value:(
                               ('"' (!'"' .)* '"') /
                               ("'" (!"'" .)* "'"))
    space?
  end

  include Terminals
end
|
58
|
+
end
|
59
|
+
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
module Slaw
|
2
|
+
module Grammars
|
3
|
+
module Tables
|
4
|
+
# Syntax node for a whole table (see the table rule).
class Table < Treetop::Runtime::SyntaxNode
  # Writes the table to the builder as a table element containing one
  # tr per row.
  #
  # b        - the XML builder to write to.
  # idprefix - String prepended to the table's id.
  # i        - index used to build the table's id.
  def to_xml(b, idprefix, i=0)
    b.table(id: "#{idprefix}table#{i}") do |table|
      # Consecutive cells form a row; anything that is not a cell is a
      # row marker and only separates rows, so markers with no cells
      # between them produce nothing.
      rows = table_body.elements
                       .chunk { |child| child.is_a?(TableCell) }
                       .select { |is_cell, _group| is_cell }
                       .map { |_is_cell, group| group }

      rows.each do |cells|
        table.tr do |tr|
          cells.each { |cell| cell.to_xml(tr, "") }
        end
      end
    end
  end
end
|
33
|
+
|
34
|
+
# Syntax node for a single table cell (see the table_cell rule).
class TableCell < Treetop::Runtime::SyntaxNode
  # Writes the cell to the builder as a th or td element holding one
  # paragraph with all of the cell's lines.
  #
  # b        - the XML builder to write to.
  # idprefix - accepted for signature compatibility; not used here.
  def to_xml(b, idprefix)
    # a cell whose text starts with "!" is a header cell
    tag = text_value.start_with?('!') ? 'th' : 'td'

    attrs = {}
    unless attribs.empty?
      attribs.attribs.elements.each do |attrib|
        # key=value (drop the quote character on either side of the value)
        key = attrib.name.text_value.strip
        attrs[key] = attrib.value.text_value[1..-2]
      end
    end

    b.send(tag.to_sym, attrs) do |cell|
      cell.p do |para|
        # the first line is labelled directly; the rest sit in the
        # trailing repetition
        following = content.elements.last.elements.map(&:line)
        all_lines = [content.line, *following]
        final_index = all_lines.length - 1

        all_lines.each_with_index do |line, index|
          line.to_xml(para, index, index == final_index)
        end
      end
    end
  end
end
|
58
|
+
|
59
|
+
# Syntax node for one line of table cell content.
class TableLine < Treetop::Runtime::SyntaxNode
  # Writes the line's clauses, then one b.eol per newline that ends the
  # line.
  #
  # b    - the XML builder to write to.
  # i    - index of this line within its cell.
  # tail - true when this is the last line of the cell.
  #
  # No eol is written after the last line, nor after an empty first
  # line, so whitespace at the start and end of a cell is dropped.
  def to_xml(b, i, tail)
    has_clauses = !clauses.empty?
    clauses.to_xml(b) if has_clauses

    return if tail
    return unless i > 0 || has_clauses

    eol.text_value.count("\n").times { b.eol }
  end
end
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Slaw
|
4
|
+
module Grammars
|
5
|
+
grammar Terminals
  #########
  ## one line of basic content

  rule content
    # anything but a newline, followed by an end-of-line
    # (which is checked but not consumed).
    # NOTE(review): eol as defined in this grammar always starts with a
    # newline, so content at the very end of input with no trailing
    # newline does not match - presumably the input is normalised to end
    # with a newline; confirm.
    [^\n]+ &eol
  end

  ##########
  # terminals

  # eg. 2, 2A, 2b
  rule number_letter
    number letter*
  end

  # a letter followed by any number of letters and digits
  rule letter_ordinal
    letter (letter / digit)*
  end

  # three or more numbers separated by dots, eg. 8.2.1
  rule dotted_number_3
    number '.' number ('.' number)+
  end

  # exactly two numbers separated by a dot, eg. 8.2
  rule dotted_number_2
    number '.' number
  end

  # one or more digits
  rule number
    digit+
  end

  # a single ASCII digit
  rule digit
    [0-9]
  end

  # a single ASCII letter, upper or lower case
  rule letter
    [a-zA-Z]
  end

  # one or more ASCII letters or digits
  rule alphanums
    [a-zA-Z0-9]+
  end

  # a straight double quote or a curly double quote (opening or closing)
  rule quotes
    ["“”]
  end

  # any single character that is not one of the quotes above
  rule non_quotes
    [^"“”]
  end

  ##########
  # whitespace

  # one or more spaces or tabs (never a newline)
  rule space
    [ \t]+
  end

  # zero or more spaces, tabs or newlines; matches the empty string
  rule whitespace
    [ \t\n]*
  end

  # a line holding nothing but optional spaces and tabs
  rule empty_line
    space? newline
  end

  # the end of a line: a newline plus any empty lines that follow it
  rule eol
    newline
    empty_line*
  end

  # a single line feed character
  rule newline
    "\n"
  end
end
|
83
|
+
end
|
84
|
+
end
|
@@ -0,0 +1,222 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'slaw/parse/grammar_helpers'
|
4
|
+
require 'slaw/grammars/za/act_nodes'
|
5
|
+
|
6
|
+
require 'slaw/grammars/terminals'
|
7
|
+
require 'slaw/grammars/tables'
|
8
|
+
require 'slaw/grammars/schedules'
|
9
|
+
require 'slaw/grammars/inlines'
|
10
|
+
|
11
|
+
module Slaw
|
12
|
+
module Grammars
|
13
|
+
module ZA
|
14
|
+
grammar Act
  # Top-level grammar for an act. The rules for inline content, tables,
  # schedules and terminals come from the grammars included at the
  # bottom of this grammar.
  include Slaw::Parse::GrammarHelpers

  ########
  # major containers

  # A whole act: optional preface and preamble, a mandatory body and
  # optional schedules. Leading empty lines are skipped.
  rule act
    empty_line*
    preface:preface?
    preamble:preamble?
    body
    schedules:schedules_container? <Act>
  end

  # Statements before the preamble, optionally introduced by a
  # "PREFACE" line.
  # NOTE(review): the 'PREAMBLE' lookaheads here are case-sensitive,
  # while the preamble rule matches its marker case-insensitively -
  # confirm this difference is intended.
  rule preface
    !'PREAMBLE'
    ('PREFACE'i space? eol)?
    statements:(!'PREAMBLE' pre_body_statement)* <Preface>
  end

  # A "PREAMBLE" line (any case) followed by its statements.
  rule preamble
    'PREAMBLE'i space? eol
    statements:pre_body_statement* <Preamble>
  end

  # The body: one or more containers or blocks of paragraphs.
  rule body
    children:(chapter / part / section / subsection / block_paragraphs)+ <Body>
  end

  # A chapter heading and everything below it. A chapter cannot contain
  # another chapter.
  rule chapter
    heading:chapter_heading
    children:(part / section / subsection / block_paragraphs)*
    <Chapter>
  end

  # A part heading and everything below it. A part cannot contain a
  # chapter or another part.
  rule part
    heading:part_heading
    children:(section / subsection / block_paragraphs)*
    <Part>
  end

  # A section title followed by subsections and blocks of paragraphs.
  rule section
    section_title
    children:(subsection / block_paragraphs)* <Section>
  end

  # A numbered subsection. Content may start on the same line as the
  # number (first_child) and continue on the following lines (children).
  rule subsection
    space? subsection_prefix space?
    # eg: (2) (a) foo
    first_child:inline_block_element?
    # eg: (2)
    eol?
    children:block_element* <Subsection>
  end

  ##########
  # group elements
  #
  # these are used externally and provide support when parsing just
  # a particular portion of a document

  rule chapters
    children:chapter+ <GroupNode>
  end

  rule parts
    children:part+ <GroupNode>
  end

  rule sections
    children:section+ <GroupNode>
  end

  ##########
  # headings

  # "Chapter <num>" with optional heading text, which may be on the same
  # line or on the next one.
  rule chapter_heading
    space? chapter_heading_prefix heading:(newline? content)? eol
    <ChapterHeading>
  end

  # "Part <num>" with optional heading text, which may be on the same
  # line or on the next one.
  rule part_heading
    space? part_heading_prefix heading:(newline? content)? eol
    <PartHeading>
  end

  # A section title in one of two layouts. The two alternatives are
  # guarded by opposite semantic predicates on the
  # :section_number_after_title option, so at most one of them can
  # match for a given parse.
  rule section_title
    section_title_1 / section_1_title
  end

  # Layout used when the :section_number_after_title option is truthy.
  # NOTE(review): `options` is not defined in this grammar; presumably
  # it comes from Slaw::Parse::GrammarHelpers - confirm.
  rule section_title_1
    &{ |s| options[:section_number_after_title] }
    # Section title
    # 1. Section content
    content eol
    section_title_prefix whitespace <SectionTitleType1>
  end

  # Layout used when the :section_number_after_title option is falsy.
  rule section_1_title
    # 1. Section title
    # Section content
    #
    # Additionally, the section title is optional.
    !{ |s| options[:section_number_after_title] }
    space? section_title_prefix section_title:section_title_content? eol?
    <SectionTitleType2>
  end

  # The title text that follows a section number on the same line.
  rule section_title_content
    # don't match subsections, eg.
    #
    # 10. (1) subsection content...
    space !subsection_prefix content eol
  end

  ##########
  # blocks of content inside containers

  # One or more consecutive block elements.
  rule block_paragraphs
    block_element+ <BlockParagraph>
  end

  # A table, a list or a plain statement. The alternatives are tried in
  # this order.
  rule block_element
    (table / blocklist / naked_statement)
  end

  # Block elements that don't have to appear at the start of a line.
  # ie. we don't need to guard against the start of a chapter, section, etc.
  rule inline_block_element
    (table / blocklist / inline_statement)
  end

  # One or more consecutive list items.
  rule blocklist
    blocklist_item+ <Blocklist>
  end

  # A list item: its prefix followed by optional content. The content
  # must not itself start with a list item prefix.
  rule blocklist_item
    # TODO: this whitespace should probably be space, to allow empty blocklist items followed by plain text
    space? blocklist_item_prefix whitespace item_content:(!blocklist_item_prefix clauses:clauses? eol)? eol?
    <BlocklistItem>
  end

  # eg. (a), (iv), (2A) is not matched because it starts with a digit;
  # or a dotted number with three or more parts, eg. 8.2.1
  rule blocklist_item_prefix
    ('(' letter_ordinal ')') / dotted_number_3
  end

  ##########
  # statements - single lines of content
  #
  # If a statement starts with a backslash, it's considered to have escaped the subsequent word,
  # and is ignored. This allows escaping of section headings, etc.

  # A line of content that is not the start of a chapter, part, section,
  # schedule or subsection.
  rule naked_statement
    space? !(chapter_heading / part_heading / section_title / schedule_title / subsection_prefix) '\\'? clauses eol
    <NakedStatement>
  end

  # Like naked_statement, but a line that starts with a subsection
  # prefix is allowed. Used by the preface and preamble rules.
  rule pre_body_statement
    space? !(chapter_heading / part_heading / section_title / schedule_title) '\\'? clauses eol
    <NakedStatement>
  end

  ##########
  # prefixes

  # "part" (any case), a space and a number, followed by any run of
  # spaces, colons and hyphens
  rule part_heading_prefix
    'part'i space alphanums [ :-]*
  end

  # "chapter" (any case), a space and a number, followed by any run of
  # spaces, colons and hyphens
  rule chapter_heading_prefix
    'chapter'i space alphanums [ :-]*
  end

  # a section number with an optional trailing full stop, eg. 1. or 2A
  rule section_title_prefix
    number_letter '.'?
  end

  rule subsection_prefix
    # there are two subsection handling syntaxes:
    #
    # (1) foo
    # (2A) foo
    #
    # and
    #
    # 8.2 foo
    # 8.3 bar
    #
    # The second is less common, but this allows us to handle it.
    # Note that it is usually accompanied by a similar list number format:
    #
    # 8.2.1 item 1
    # 8.2.2 item 2
    #
    # which aren't subsections, but lists, so force the space at the end
    # of the number to catch this case.
    num:('(' number_letter ')')
    /
    num:dotted_number_2 '.'? space
  end

  include Slaw::Grammars::Inlines
  include Slaw::Grammars::Tables
  include Slaw::Grammars::Schedules
  include Slaw::Grammars::Terminals
end
|
220
|
+
end
|
221
|
+
end
|
222
|
+
end
|