relaton-oasis 1.10.0

Sign up to get free protection for your applications and to get access to all the features.
data/grammars/reqt.rng ADDED
@@ -0,0 +1,223 @@
1
<?xml version="1.0" encoding="UTF-8"?>
<grammar xmlns="http://relaxng.org/ns/structure/1.0" datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes">
  <!--
    Presupposes isodoc.rnc, is included in it
    include "isodoc.rnc" { }
  -->
  <!--
    NOTE(review): FormattedString, TextElement, BasicBlock, bibitem and
    MultilingualRenderingType are referenced below but not defined in this
    file; presumably they are supplied by the including grammar. Confirm.
  -->

  <!-- requirement, recommendation and permission share one content model. -->
  <define name="requirement">
    <element name="requirement">
      <ref name="RequirementType"/>
    </element>
  </define>
  <define name="recommendation">
    <element name="recommendation">
      <ref name="RequirementType"/>
    </element>
  </define>
  <define name="permission">
    <element name="permission">
      <ref name="RequirementType"/>
    </element>
  </define>

  <!--
    Content model of a requirement-like element: a set of attributes, of
    which only "id" is mandatory, followed by the child elements in the
    order listed.
  -->
  <define name="RequirementType">
    <optional>
      <attribute name="obligation">
        <ref name="ObligationType"/>
      </attribute>
    </optional>
    <optional>
      <attribute name="unnumbered">
        <data type="boolean"/>
      </attribute>
    </optional>
    <optional>
      <attribute name="number"/>
    </optional>
    <optional>
      <attribute name="subsequence"/>
    </optional>
    <optional>
      <attribute name="keep-with-next">
        <data type="boolean"/>
      </attribute>
    </optional>
    <optional>
      <attribute name="keep-lines-together">
        <data type="boolean"/>
      </attribute>
    </optional>
    <!-- The only required attribute. -->
    <attribute name="id">
      <data type="ID"/>
    </attribute>
    <optional>
      <attribute name="filename"/>
    </optional>
    <optional>
      <attribute name="model"/>
    </optional>
    <optional>
      <attribute name="type"/>
    </optional>
    <optional>
      <attribute name="tag"/>
    </optional>
    <optional>
      <attribute name="multilingual-rendering">
        <ref name="MultilingualRenderingType"/>
      </attribute>
    </optional>
    <optional>
      <ref name="reqtitle"/>
    </optional>
    <optional>
      <ref name="label"/>
    </optional>
    <zeroOrMore>
      <ref name="subject"/>
    </zeroOrMore>
    <zeroOrMore>
      <ref name="reqinherit"/>
    </zeroOrMore>
    <zeroOrMore>
      <ref name="classification"/>
    </zeroOrMore>
    <!-- Any number of subparts, in any mix and order. -->
    <zeroOrMore>
      <choice>
        <ref name="measurementtarget"/>
        <ref name="specification"/>
        <ref name="verification"/>
        <ref name="import"/>
        <ref name="description"/>
        <ref name="component"/>
      </choice>
    </zeroOrMore>
    <optional>
      <ref name="reqt_references"/>
    </optional>
    <!-- Requirement-like elements may nest recursively. -->
    <zeroOrMore>
      <choice>
        <ref name="requirement"/>
        <ref name="recommendation"/>
        <ref name="permission"/>
      </choice>
    </zeroOrMore>
  </define>

  <!-- Serialised as a "title" element. -->
  <define name="reqtitle">
    <element name="title">
      <ref name="FormattedString"/>
    </element>
  </define>
  <define name="label">
    <element name="label">
      <oneOrMore>
        <ref name="TextElement"/>
      </oneOrMore>
    </element>
  </define>
  <define name="subject">
    <element name="subject">
      <oneOrMore>
        <ref name="TextElement"/>
      </oneOrMore>
    </element>
  </define>
  <!-- Serialised as an "inherit" element. -->
  <define name="reqinherit">
    <element name="inherit">
      <oneOrMore>
        <ref name="TextElement"/>
      </oneOrMore>
    </element>
  </define>

  <!-- Subpart elements; all use the RequirementSubpart content model. -->
  <define name="measurementtarget">
    <element name="measurement-target">
      <ref name="RequirementSubpart"/>
    </element>
  </define>
  <define name="specification">
    <element name="specification">
      <ref name="RequirementSubpart"/>
    </element>
  </define>
  <define name="verification">
    <element name="verification">
      <ref name="RequirementSubpart"/>
    </element>
  </define>
  <define name="import">
    <element name="import">
      <ref name="RequirementSubpart"/>
    </element>
  </define>
  <define name="description">
    <element name="description">
      <ref name="RequirementSubpart"/>
    </element>
  </define>
  <!-- Like the other subparts, plus a mandatory "class" attribute. -->
  <define name="component">
    <element name="component">
      <attribute name="class"/>
      <ref name="RequirementSubpart"/>
    </element>
  </define>

  <!-- Serialised as a "references" element holding one or more bibitem. -->
  <define name="reqt_references">
    <element name="references">
      <oneOrMore>
        <ref name="bibitem"/>
      </oneOrMore>
    </element>
  </define>

  <!-- Shared content model of subparts: optional attributes, then one or more blocks. -->
  <define name="RequirementSubpart">
    <optional>
      <attribute name="type"/>
    </optional>
    <optional>
      <attribute name="exclude">
        <data type="boolean"/>
      </attribute>
    </optional>
    <optional>
      <attribute name="keep-with-next">
        <data type="boolean"/>
      </attribute>
    </optional>
    <optional>
      <attribute name="keep-lines-together">
        <data type="boolean"/>
      </attribute>
    </optional>
    <optional>
      <attribute name="tag"/>
    </optional>
    <optional>
      <attribute name="multilingual-rendering">
        <ref name="MultilingualRenderingType"/>
      </attribute>
    </optional>
    <oneOrMore>
      <ref name="BasicBlock"/>
    </oneOrMore>
  </define>

  <!-- Allowed values of the "obligation" attribute. -->
  <define name="ObligationType">
    <choice>
      <value>requirement</value>
      <value>recommendation</value>
      <value>permission</value>
    </choice>
  </define>

  <!-- A classification is exactly one "tag" followed by one "value", both plain text. -->
  <define name="classification">
    <element name="classification">
      <ref name="classification_tag"/>
      <ref name="classification_value"/>
    </element>
  </define>
  <define name="classification_tag">
    <element name="tag">
      <text/>
    </element>
  </define>
  <define name="classification_value">
    <element name="value">
      <text/>
    </element>
  </define>
</grammar>
@@ -0,0 +1,91 @@
1
module RelatonOasis
  #
  # Fetches the OASIS standards index page and writes every document found
  # there, plus the parts of multi-part documents, to files in the output
  # directory.
  #
  class DataFetcher
    #
    # Initialize a new DataFetcher
    #
    # @param [String] output directory to save files
    # @param [String] format format of output files (xml, yaml, bibxml)
    #
    def initialize(output, format)
      @output = output
      @format = format
      # A leading "bib" or "rfc" is dropped to get the file extension,
      # e.g. "bibxml" is saved as ".xml".
      @ext = @format.sub(/^bib|^rfc/, "")
      # File names written so far, used to report name collisions.
      @files = []
    end

    #
    # Initialize fetcher and run fetch
    #
    # @param [String] output directory to save files, default: "data"
    # @param [String] format format of output files (xml, yaml, bibxml); default: yaml
    #
    def self.fetch(output: "data", format: "yaml")
      t1 = Time.now
      puts "Started at: #{t1}"
      # mkdir_p is a no-op when the directory already exists.
      FileUtils.mkdir_p output
      new(output, format).fetch
      t2 = Time.now
      puts "Stopped at: #{t2}"
      puts "Done in: #{(t2 - t1).round} sec."
    end

    #
    # Fetch and save all the documents from OASIS
    #
    def fetch
      agent = Mechanize.new
      resp = agent.get "https://www.oasis-open.org/standards/"
      doc = Nokogiri::HTML resp.body
      # Each <details> element on the index page is handled as one document.
      doc.xpath("//details").each do |item|
        save_doc DataParser.new(item).parse
        fetch_parts item
      end
    end

    #
    # Fetch and save parts of document
    #
    # @param [Nokogiri::HTML::Element] item document node
    #
    def fetch_parts(item)
      parts = item.xpath("./div/div/div[contains(@class, 'standard__grid--cite-as')]/p[strong or span/strong]")
      # A single matching paragraph is not treated as a part.
      return unless parts.size > 1

      parts.each do |part|
        save_doc DataPartParser.new(part).parse
      end
    end

    #
    # Save document to file
    #
    # @param [RelatonOasis::OasisBibliographicItem] doc
    #
    def save_doc(doc) # rubocop:disable Metrics/MethodLength
      c = case @format
          when "xml" then doc.to_xml(bibdata: true)
          when "yaml" then doc.to_hash.to_yaml
          else doc.send("to_#{@format}")
          end
      file = file_name doc
      if @files.include? file
        warn "File #{file} already exists. Document: #{doc.docnumber}"
      else
        @files << file
      end
      # The file is written even after a collision warning, so the later
      # document overwrites the earlier one.
      File.write file, c, encoding: "UTF-8"
    end

    #
    # Generate file name
    #
    # @param [RelatonOasis::OasisBibliographicItem] doc
    #
    # @return [String] file name
    #
    def file_name(doc)
      # Whitespace, commas, colons and slashes become "_"; runs of "_" collapse.
      name = doc.docnumber.gsub(/[\s,:\/]/, "_").squeeze("_").upcase
      File.join @output, "#{name}.#{@ext}"
    end
  end
end
@@ -0,0 +1,204 @@
1
module RelatonOasis
  #
  # Builds a bibliographic item from one document node of the OASIS
  # standards page.
  #
  class DataParser
    include RelatonOasis::DataParserUtils

    #
    # Initialize parser.
    #
    # @param [Nokogiri::HTML::Element] node document node
    #
    def initialize(node)
      @node = node
      @title = @node.at("./summary/div/h2").text
      # Text of the first "cite as" paragraph; nil when there is none.
      @text = @node.at(
        "./div/div/div[contains(@class, 'standard__grid--cite-as')]/p[em or i or a or span]",
      )&.text
    end

    #
    # Parse document.
    #
    # @return [RelatonOasis::OasisBibliographicItem] bibliographic item
    #
    def parse # rubocop:disable Metrics/MethodLength
      RelatonOasis::OasisBibliographicItem.new(
        fetched: Date.today.to_s,
        type: "standard",
        doctype: parse_doctype,
        title: parse_title,
        docid: parse_docid,
        docnumber: parse_docnumber,
        date: parse_date,
        abstract: parse_abstract,
        language: ["en"],
        script: ["Latn"],
        editorialgroup: parse_editorialgroup,
        relation: parse_relation,
        technology_area: parse_technology_area,
      )
    end

    #
    # Parse title.
    #
    # @return [Array<RelatonBib::TypedTitleString>] single "main" title
    #
    def parse_title
      [RelatonBib::TypedTitleString.new(type: "main", content: @title, language: "en", script: "Latn")]
    end

    #
    # Parse date.
    #
    # @return [Array<RelatonBib::BibliographicDate>] date
    #
    def parse_date
      @node.xpath("./summary/div/time[@class='standard__date']").map do |d|
        # Expects "DD Month YYYY"; Date.parse raises when nothing matches.
        date_str = d.text.match(/\d{2}\s\w+\s\d{4}/).to_s
        date = Date.parse(date_str).to_s
        RelatonBib::BibliographicDate.new(on: date, type: "issued")
      end
    end

    #
    # Parse abstract.
    #
    # @return [Array<RelatonBib::FormattedString>] abstract
    #
    def parse_abstract
      # One line per description paragraph, inner newlines/tabs flattened.
      c = @node.xpath(
        "./summary/div/div[@class='standard__description']/p",
      ).map { |a| a.text.gsub(/[\n\t]+/, " ").strip }.join("\n")
      return [] if c.empty?

      [RelatonBib::FormattedString.new(content: c, language: "en", script: "Latn")]
    end

    #
    # Parse technical committee.
    #
    # @return [RelatonBib::EditorialGroup] technical committee
    #
    def parse_editorialgroup
      tc = @node.xpath("./div[@class='standard__details']/a").map do |a|
        wg = RelatonBib::WorkGroup.new name: a.text.strip
        RelatonBib::TechnicalCommittee.new wg
      end
      RelatonBib::EditorialGroup.new tc
    end

    #
    # Parse relation.
    #
    # @return [Array<RelatonBib::DocumentRelation>] relation
    #
    def parse_relation
      rels = @node.xpath(
        "./div/div/div[contains(@class, 'standard__grid--cite-as')]/p[strong or span/strong or b/span]",
      )
      # Relations are only produced when more than one paragraph matches.
      return [] unless rels.size > 1

      rels.map do |r|
        docid = DataPartParser.new(r).parse_docid
        fref = RelatonBib::FormattedRef.new content: docid[0].id
        bib = RelatonOasis::OasisBibliographicItem.new formattedref: fref
        RelatonBib::DocumentRelation.new type: "hasPart", bibitem: bib
      end
    end

    #
    # Parse document part references.
    #
    # @return [Array<String>] document part references
    #
    def document_part_refs
      @node.css(
        ".standard__grid--cite-as > p > strong",
        "span.Refterm", "span.abbrev", "span.citationLabel > strong"
      ).map { |p| p.text.gsub(/^\[{1,2}|\]$/, "").strip } # drop surrounding brackets
    end

    #
    # Parse document number.
    #
    # @return [String] document number
    #
    def parse_docnumber
      parts = document_part_refs
      case parts.size
      when 0 then title_to_docid @title # @title holds the same h2 text read in #initialize
      when 1 then parse_spec(parts[0])
      else parts_to_docid parts
      end
    end

    #
    # Create document identifier from parts references.
    #
    # Starts from the "-"-separated chunks of the first reference and, for
    # every other reference, truncates them at the first chunk that differs
    # (case-insensitively).
    #
    # NOTE(review): a reference with fewer chunks than the accumulator does
    # not shorten it - confirm that this is intended.
    #
    # @param [Array<String>] parts parts references
    #
    # @return [String] document identifier
    #
    def parts_to_docid(parts)
      id = parts[1..-1].each_with_object(parts[0].split("-")) do |part, acc|
        chunks = part.split "-"
        chunks.each.with_index do |chunk, idx|
          unless chunk.casecmp(acc[idx])&.zero?
            acc.slice!(idx..-1)
            break
          end
        end
      end.join("-")
      parse_spec(id)
    end

    #
    # Create document identifier from title.
    #
    # When the title contains parenthesised text, the identifier is built
    # from it; otherwise it is assembled word by word from the title.
    #
    # @param [String] title title
    #
    # @return [String] document identifier
    #
    def title_to_docid(title) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
      abbrs = title.scan(/(?<=\()[^)]+(?=\))/)
      if abbrs.any?
        id = abbrs.map { |abbr| abbr.split.join("-") }.join "-"
        /(?:Version\s|v)(?<ver>[\d.]+)/ =~ title
        id += "-v#{ver}" if ver
        /(?<eb>ebXML|ebMS)/ =~ title
        id = "#{eb}-#{id}" if eb
        id
      else
        # series_end is true when the next token must start a new
        # "-"-separated segment instead of extending the last one.
        series_end = false
        words = title.sub(/\s\[OASIS\s\d+\]$/, "").split(/[,:]?\s|-|(?<=[a-z])(?=[A-Z][a-z])/)
        words.each_with_object([""]) do |word, acc|
          # The match is read directly instead of through $MATCH, which is
          # only set when the "English" library has been required.
          if (version = word[/^v[\d.]+/])
            acc << version
            series_end = true
          elsif word.match?(/^Version/)
            acc << "v"
            series_end = false
          elsif word.match?(/^\d|ebXML|ebMS/)
            series_end ? acc << word : acc[-1] += word
            series_end = true
          elsif word.match?(/^\w+$/) && word == word.upcase
            series_end ? acc << word : acc[-1] = word
            series_end = true
          elsif word.match?(/[A-Z]+[a-z]+/)
            # Capitalised words contribute only their first letter.
            series_end ? acc << word[0] : acc[-1] += word[0]
            series_end = false
          end
        end.join "-"
      end
    end

    #
    # Parse technology areas.
    #
    # @return [Array<String>] technology areas
    #
    def parse_technology_area
      super @node
    end
  end
end
@@ -0,0 +1,89 @@
1
module RelatonOasis
  #
  # Helpers shared by the document parsers. The including object is expected
  # to set @text and @title and to implement #parse_docnumber.
  #
  module DataParserUtils
    #
    # Parse document identifier specification.
    #
    # @param [String] num document number
    #
    # @return [String] document identifier with specification if needed
    #
    def parse_spec(num)
      id = case @text
           when /OASIS Project Specification (\d+)/ then "#{num}-PS#{$1}"
           when /Committee Specification (\d+)/ then "#{num}-CS#{$1}"
           else num
           end
      parse_part(id)
    end

    #
    # Parse document identifier part.
    #
    # NOTE(review): an identifier that already names a part is returned
    # without going through #parse_errata - confirm that this is intended.
    #
    # @param [String] docid document identifier
    #
    # @return [String] document identifier with part (taken from the title)
    #   and errata if needed
    #
    def parse_part(docid)
      return docid if docid.match?(/(?:Part|Pt)\d+/i)

      id = case @title
           when /Part\s(\d+)/ then "#{docid}-Pt#{$1}"
           else docid
           end
      parse_errata(id)
    end

    #
    # Parse document identifier errata.
    #
    # @param [String] id document identifier
    #
    # @return [String] document identifier with errata if needed
    #
    def parse_errata(id)
      return id if id.match?(/errata\d+/i)

      case @title
      # "Plus Errata" is tested first because it also matches the plain pattern.
      when /Plus\sErrata\s(\d+)/ then "#{id}-plus-errata#{$1}"
      when /Errata\s(\d+)/ then "#{id}-errata#{$1}"
      else id
      end
    end

    #
    # Parse document identifier.
    #
    # @return [Array<RelatonBib::DocumentIdentifier>] document identifier
    #
    def parse_docid
      id = "OASIS #{parse_docnumber}"
      [RelatonBib::DocumentIdentifier.new(type: "OASIS", id: id, primary: true)]
    end

    #
    # Parse document type.
    #
    # @return [String] document type
    #
    def parse_doctype
      case @text
      when /OASIS Project Specification/, /Committee Specification/
        "specification"
      when /Technical Memorandum/ then "memorandum"
      when /Technical Resolution/ then "resolution"
      else "standard"
      end
    end

    #
    # Parse technology area.
    #
    # @param [Nokogiri::HTML::Element] node document node
    #
    # @return [Array<String>] technology areas, whitespace replaced by "-"
    #
    def parse_technology_area(node)
      node.xpath(
        "./summary/div/div/ul[@class='technology-areas__list']/li/a",
      ).map { |ta| ta.text.strip.gsub(/\s/, "-") }
    end
  end
end