relaton-oasis 1.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/grammars/reqt.rng ADDED
@@ -0,0 +1,223 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <grammar xmlns="http://relaxng.org/ns/structure/1.0" datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes">
3
+ <!--
4
+ Presupposes isodoc.rnc, is included in it
5
+ include "isodoc.rnc" { }
6
+ -->
7
<!-- Pattern for the "requirement" element; its attributes and content are given by RequirementType. -->
+ <define name="requirement">
8
+ <element name="requirement">
9
+ <ref name="RequirementType"/>
10
+ </element>
11
+ </define>
12
<!-- Pattern for the "recommendation" element; it shares the RequirementType content model with "requirement". -->
+ <define name="recommendation">
13
+ <element name="recommendation">
14
+ <ref name="RequirementType"/>
15
+ </element>
16
+ </define>
17
<!-- Pattern for the "permission" element; it shares the RequirementType content model with "requirement". -->
+ <define name="permission">
18
+ <element name="permission">
19
+ <ref name="RequirementType"/>
20
+ </element>
21
+ </define>
22
<!--
  Content model shared by the requirement, recommendation and permission elements.
  Attributes come first (all optional except "id"), followed by an optional title and
  label, repeatable subject / inherit / classification elements, any mix of subpart
  elements, an optional references block and nested requirement-like elements.
-->
+ <define name="RequirementType">
23
+ <optional>
24
+ <attribute name="obligation">
25
+ <ref name="ObligationType"/>
26
+ </attribute>
27
+ </optional>
28
+ <optional>
29
+ <attribute name="unnumbered">
30
+ <data type="boolean"/>
31
+ </attribute>
32
+ </optional>
33
+ <optional>
34
+ <attribute name="number"/>
35
+ </optional>
36
+ <optional>
37
+ <attribute name="subsequence"/>
38
+ </optional>
39
+ <optional>
40
+ <attribute name="keep-with-next">
41
+ <data type="boolean"/>
42
+ </attribute>
43
+ </optional>
44
+ <optional>
45
+ <attribute name="keep-lines-together">
46
+ <data type="boolean"/>
47
+ </attribute>
48
+ </optional>
<!-- "id" is the only attribute here that is not wrapped in <optional>, so it is required. -->
49
+ <attribute name="id">
50
+ <data type="ID"/>
51
+ </attribute>
52
+ <optional>
53
+ <attribute name="filename"/>
54
+ </optional>
55
+ <optional>
56
+ <attribute name="model"/>
57
+ </optional>
58
+ <optional>
59
+ <attribute name="type"/>
60
+ </optional>
61
+ <optional>
62
+ <attribute name="tag"/>
63
+ </optional>
<!-- MultilingualRenderingType is not defined in this file; presumably it is supplied by the including grammar (to be confirmed). -->
64
+ <optional>
65
+ <attribute name="multilingual-rendering">
66
+ <ref name="MultilingualRenderingType"/>
67
+ </attribute>
68
+ </optional>
69
+ <optional>
70
+ <ref name="reqtitle"/>
71
+ </optional>
72
+ <optional>
73
+ <ref name="label"/>
74
+ </optional>
75
+ <zeroOrMore>
76
+ <ref name="subject"/>
77
+ </zeroOrMore>
78
+ <zeroOrMore>
79
+ <ref name="reqinherit"/>
80
+ </zeroOrMore>
81
+ <zeroOrMore>
82
+ <ref name="classification"/>
83
+ </zeroOrMore>
<!-- Subpart elements may appear any number of times and in any order. -->
84
+ <zeroOrMore>
85
+ <choice>
86
+ <ref name="measurementtarget"/>
87
+ <ref name="specification"/>
88
+ <ref name="verification"/>
89
+ <ref name="import"/>
90
+ <ref name="description"/>
91
+ <ref name="component"/>
92
+ </choice>
93
+ </zeroOrMore>
94
+ <optional>
95
+ <ref name="reqt_references"/>
96
+ </optional>
<!-- Recursive part: requirement, recommendation and permission elements may be nested inside one another. -->
97
+ <zeroOrMore>
98
+ <choice>
99
+ <ref name="requirement"/>
100
+ <ref name="recommendation"/>
101
+ <ref name="permission"/>
102
+ </choice>
103
+ </zeroOrMore>
104
+ </define>
105
<!-- The pattern is named "reqtitle" but the element it describes is "title"; FormattedString is not defined in this file. -->
+ <define name="reqtitle">
106
+ <element name="title">
107
+ <ref name="FormattedString"/>
108
+ </element>
109
+ </define>
110
<!-- "label" element holding one or more TextElement items (TextElement is not defined in this file). -->
+ <define name="label">
111
+ <element name="label">
112
+ <oneOrMore>
113
+ <ref name="TextElement"/>
114
+ </oneOrMore>
115
+ </element>
116
+ </define>
117
<!-- "subject" element holding one or more TextElement items (TextElement is not defined in this file). -->
+ <define name="subject">
118
+ <element name="subject">
119
+ <oneOrMore>
120
+ <ref name="TextElement"/>
121
+ </oneOrMore>
122
+ </element>
123
+ </define>
124
<!-- The pattern is named "reqinherit" but the element it describes is "inherit"; it holds one or more TextElement items. -->
+ <define name="reqinherit">
125
+ <element name="inherit">
126
+ <oneOrMore>
127
+ <ref name="TextElement"/>
128
+ </oneOrMore>
129
+ </element>
130
+ </define>
131
<!-- Subpart pattern for the "measurement-target" element (note the hyphen in the element name); content is RequirementSubpart. -->
+ <define name="measurementtarget">
132
+ <element name="measurement-target">
133
+ <ref name="RequirementSubpart"/>
134
+ </element>
135
+ </define>
136
<!-- Subpart pattern for the "specification" element; content is RequirementSubpart. -->
+ <define name="specification">
137
+ <element name="specification">
138
+ <ref name="RequirementSubpart"/>
139
+ </element>
140
+ </define>
141
<!-- Subpart pattern for the "verification" element; content is RequirementSubpart. -->
+ <define name="verification">
142
+ <element name="verification">
143
+ <ref name="RequirementSubpart"/>
144
+ </element>
145
+ </define>
146
<!-- Subpart pattern for the "import" element; content is RequirementSubpart. -->
+ <define name="import">
147
+ <element name="import">
148
+ <ref name="RequirementSubpart"/>
149
+ </element>
150
+ </define>
151
<!-- Subpart pattern for the "description" element; content is RequirementSubpart. -->
+ <define name="description">
152
+ <element name="description">
153
+ <ref name="RequirementSubpart"/>
154
+ </element>
155
+ </define>
156
<!-- Subpart pattern for the "component" element: a required "class" attribute in addition to the RequirementSubpart content. -->
+ <define name="component">
157
+ <element name="component">
158
+ <attribute name="class"/>
159
+ <ref name="RequirementSubpart"/>
160
+ </element>
161
+ </define>
162
<!-- "references" element containing one or more bibitem entries (bibitem is not defined in this file). -->
+ <define name="reqt_references">
163
+ <element name="references">
164
+ <oneOrMore>
165
+ <ref name="bibitem"/>
166
+ </oneOrMore>
167
+ </element>
168
+ </define>
169
<!--
  Content model shared by the subpart elements (measurement-target, specification,
  verification, import, description, component): a set of optional attributes
  followed by at least one BasicBlock (BasicBlock is not defined in this file).
-->
+ <define name="RequirementSubpart">
170
+ <optional>
171
+ <attribute name="type"/>
172
+ </optional>
173
+ <optional>
174
+ <attribute name="exclude">
175
+ <data type="boolean"/>
176
+ </attribute>
177
+ </optional>
178
+ <optional>
179
+ <attribute name="keep-with-next">
180
+ <data type="boolean"/>
181
+ </attribute>
182
+ </optional>
183
+ <optional>
184
+ <attribute name="keep-lines-together">
185
+ <data type="boolean"/>
186
+ </attribute>
187
+ </optional>
188
+ <optional>
189
+ <attribute name="tag"/>
190
+ </optional>
191
+ <optional>
192
+ <attribute name="multilingual-rendering">
193
+ <ref name="MultilingualRenderingType"/>
194
+ </attribute>
195
+ </optional>
<!-- The body is mandatory: one or more BasicBlock items. -->
196
+ <oneOrMore>
197
+ <ref name="BasicBlock"/>
198
+ </oneOrMore>
199
+ </define>
200
<!-- Enumeration of the values allowed for the "obligation" attribute: requirement, recommendation or permission. -->
+ <define name="ObligationType">
201
+ <choice>
202
+ <value>requirement</value>
203
+ <value>recommendation</value>
204
+ <value>permission</value>
205
+ </choice>
206
+ </define>
207
<!-- "classification" element: exactly one "tag" element followed by exactly one "value" element. -->
+ <define name="classification">
208
+ <element name="classification">
209
+ <ref name="classification_tag"/>
210
+ <ref name="classification_value"/>
211
+ </element>
212
+ </define>
213
<!-- "tag" element with plain text content; referenced from the classification pattern. -->
+ <define name="classification_tag">
214
+ <element name="tag">
215
+ <text/>
216
+ </element>
217
+ </define>
218
<!-- "value" element with plain text content; referenced from the classification pattern. -->
+ <define name="classification_value">
219
+ <element name="value">
220
+ <text/>
221
+ </element>
222
+ </define>
223
+ </grammar>
@@ -0,0 +1,91 @@
1
module RelatonOasis
  #
  # Downloads the OASIS standards index page, converts every document found
  # on it (and the parts listed for a document) into a bibliographic item and
  # writes each item to its own file.
  #
  class DataFetcher
    #
    # Initialize a new DataFetcher
    #
    # @param [String] output directory to save files
    # @param [String] format format of output files (xml, yaml, bibxml)
    #
    def initialize(output, format)
      @output = output
      @format = format
      # A leading "bib" or "rfc" is dropped to get the file extension,
      # e.g. "bibxml" is stored as ".xml".
      @ext = @format.sub(/^bib|^rfc/, "")
      # Names of files written so far; used to report name collisions.
      @files = []
    end

    #
    # Initialize fetcher and run fetch. Prints start/stop time and the
    # elapsed time in seconds to stdout.
    #
    # @param [String] output directory to save files, default: "data"
    # @param [String] format format of output files (xml, yaml, bibxml); default: yaml
    #
    def self.fetch(output: "data", format: "yaml")
      t1 = Time.now
      puts "Started at: #{t1}"
      # mkdir_p is a no-op when the directory already exists.
      FileUtils.mkdir_p output
      new(output, format).fetch
      t2 = Time.now
      puts "Stopped at: #{t2}"
      puts "Done in: #{(t2 - t1).round} sec."
    end

    #
    # Fetch and save all the documents from OASIS
    #
    def fetch
      agent = Mechanize.new
      resp = agent.get "https://www.oasis-open.org/standards/"
      doc = Nokogiri::HTML resp.body
      # Iteration is done only for its side effects (writing files).
      doc.xpath("//details").each do |item|
        save_doc DataParser.new(item).parse
        fetch_parts item
      end
    end

    #
    # Fetch and save parts of document. Nothing is saved unless more than
    # one part paragraph is found.
    #
    # @param [Nokogiri::HTML::Element] item document node
    #
    def fetch_parts(item)
      parts = item.xpath("./div/div/div[contains(@class, 'standard__grid--cite-as')]/p[strong or span/strong]")
      return unless parts.size > 1

      parts.each do |part|
        save_doc DataPartParser.new(part).parse
      end
    end

    #
    # Save document to file. When a file with the same name has already been
    # written during this run a warning is printed and the file is
    # overwritten.
    #
    # @param [RelatonOasis::OasisBibliographicItem] doc
    #
    def save_doc(doc) # rubocop:disable Metrics/MethodLength
      c = case @format
          when "xml" then doc.to_xml(bibdata: true)
          when "yaml" then doc.to_hash.to_yaml
          # public_send keeps the dynamic dispatch limited to the public API
          # of the document.
          else doc.public_send("to_#{@format}")
          end
      file = file_name doc
      if @files.include? file
        warn "File #{file} already exists. Document: #{doc.docnumber}"
      else
        @files << file
      end
      File.write file, c, encoding: "UTF-8"
    end

    #
    # Generate file name: whitespace, commas, colons and slashes in the
    # document number become underscores, runs of underscores are collapsed
    # and the result is upper-cased.
    #
    # @param [RelatonOasis::OasisBibliographicItem] doc
    #
    # @return [String] file name
    #
    def file_name(doc)
      name = doc.docnumber.gsub(%r{[\s,:/]}, "_").squeeze("_").upcase
      File.join @output, "#{name}.#{@ext}"
    end
  end
end
@@ -0,0 +1,204 @@
1
module RelatonOasis
  #
  # Builds a bibliographic item from a single document node: the title is
  # read from the node's summary, and identifier details are derived from the
  # "cite as" block and the title.
  #
  class DataParser
    include RelatonOasis::DataParserUtils

    #
    # Initialize parser.
    #
    # @param [Nokogiri::HTML::Element] node document node
    #
    def initialize(node)
      @node = node
      @title = @node.at("./summary/div/h2").text
      # Text of the first "cite as" paragraph that has an em, i, a or span
      # child; nil when there is no such paragraph.
      @text = @node.at(
        "./div/div/div[contains(@class, 'standard__grid--cite-as')]/p[em or i or a or span]",
      )&.text
    end

    #
    # Parse document.
    #
    # @return [RelatonOasis::OasisBibliographicItem] bibliographic item
    #
    def parse # rubocop:disable Metrics/MethodLength
      RelatonOasis::OasisBibliographicItem.new(
        fetched: Date.today.to_s,
        type: "standard",
        doctype: parse_doctype,
        title: parse_title,
        docid: parse_docid,
        docnumber: parse_docnumber,
        date: parse_date,
        abstract: parse_abstract,
        language: ["en"],
        script: ["Latn"],
        editorialgroup: parse_editorialgroup,
        relation: parse_relation,
        technology_area: parse_technology_area,
      )
    end

    #
    # Parse title.
    #
    # @return [Array<RelatonBib::TypedTitleString>] single "main" title
    #
    def parse_title
      [RelatonBib::TypedTitleString.new(type: "main", content: @title, language: "en", script: "Latn")]
    end

    #
    # Parse date. A date of the form "DD Month YYYY" is extracted from each
    # time element of the summary.
    #
    # @return [Array<RelatonBib::BibliographicDate>] date
    #
    # @raise [ArgumentError] if a time element holds no date in that form
    #   (Date.parse is then called with an empty string)
    #
    def parse_date
      @node.xpath("./summary/div/time[@class='standard__date']").map do |d|
        date_str = d.text.match(/\d{2}\s\w+\s\d{4}/).to_s
        date = Date.parse(date_str).to_s
        RelatonBib::BibliographicDate.new(on: date, type: "issued")
      end
    end

    #
    # Parse abstract. Description paragraphs are joined with newlines;
    # line breaks and tabs inside a paragraph are replaced with a space.
    #
    # @return [Array<RelatonBib::FormattedString>] abstract, empty when there
    #   is no description text
    #
    def parse_abstract
      c = @node.xpath(
        "./summary/div/div[@class='standard__description']/p",
      ).map { |a| a.text.gsub(/[\n\t]+/, " ").strip }.join("\n")
      return [] if c.empty?

      [RelatonBib::FormattedString.new(content: c, language: "en", script: "Latn")]
    end

    #
    # Parse technical committee.
    #
    # @return [RelatonBib::EditorialGroup] technical committee
    #
    def parse_editorialgroup
      tc = @node.xpath("./div[@class='standard__details']/a").map do |a|
        wg = RelatonBib::WorkGroup.new name: a.text.strip
        RelatonBib::TechnicalCommittee.new wg
      end
      RelatonBib::EditorialGroup.new tc
    end

    #
    # Parse relation. Each part paragraph of the "cite as" block becomes a
    # "hasPart" relation; no relations are produced unless there is more
    # than one such paragraph.
    #
    # @return [Array<RelatonBib::DocumentRelation>] relation
    #
    def parse_relation
      rels = @node.xpath(
        "./div/div/div[contains(@class, 'standard__grid--cite-as')]/p[strong or span/strong or b/span]",
      )
      return [] unless rels.size > 1

      rels.map do |r|
        docid = DataPartParser.new(r).parse_docid
        fref = RelatonBib::FormattedRef.new content: docid[0].id
        bib = RelatonOasis::OasisBibliographicItem.new formattedref: fref
        RelatonBib::DocumentRelation.new type: "hasPart", bibitem: bib
      end
    end

    #
    # Parse document part references. Leading "[" / "[[" and a trailing "]"
    # are removed from each reference.
    #
    # @return [Array<String>] document part references
    #
    def document_part_refs
      @node.css(
        ".standard__grid--cite-as > p > strong",
        "span.Refterm", "span.abbrev", "span.citationLabel > strong"
      ).map { |p| p.text.gsub(/^\[{1,2}|\]$/, "").strip }
    end

    #
    # Parse document number. With no part reference the number is derived
    # from the title, with one reference it is that reference, otherwise it
    # is built from what the references have in common.
    #
    # @return [String] document number
    #
    def parse_docnumber
      parts = document_part_refs
      case parts.size
      when 0 then title_to_docid @title
      when 1 then parse_spec(parts[0])
      else parts_to_docid parts
      end
    end

    #
    # Create document identifier from parts references. The first reference
    # is split on "-" and cut at the first chunk where a later reference
    # differs (compared case-insensitively).
    #
    # @param [Array<String>] parts parts references
    #
    # @return [String] document identifier
    #
    def parts_to_docid(parts)
      id = parts[1..-1].each_with_object(parts[0].split("-")) do |part, acc|
        chunks = part.split "-"
        chunks.each.with_index do |chunk, idx|
          # casecmp returns nil when acc has no chunk at idx; &. turns that
          # into a mismatch as well.
          unless chunk.casecmp(acc[idx])&.zero?
            acc.slice!(idx..-1)
            break
          end
        end
      end.join("-")
      parse_spec(id)
    end

    #
    # Create document identifier from title.
    #
    # When the title has parenthesised abbreviations they form the
    # identifier (plus a version suffix and an ebXML/ebMS prefix when
    # present in the title). Otherwise the identifier is assembled word by
    # word from the title.
    #
    # @param [String] title title
    #
    # @return [String] document identifier
    #
    def title_to_docid(title) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
      abbrs = title.scan(/(?<=\()[^)]+(?=\))/)
      if abbrs.any?
        id = abbrs.map { |abbr| abbr.split.join("-") }.join "-"
        # A regexp literal with named groups on the left of =~ assigns the
        # groups to local variables (nil when there is no match).
        /(?:Version\s|v)(?<ver>[\d.]+)/ =~ title
        id += "-v#{ver}" if ver
        /(?<eb>ebXML|ebMS)/ =~ title
        id = "#{eb}-#{id}" if eb
        id
      else
        # series_end tells whether the last chunk in acc is complete (the
        # next token starts a new chunk) or still being built.
        series_end = false
        title.sub(/\s\[OASIS\s\d+\]$/, "").split(/[,:]?\s|-|(?<=[a-z])(?=[A-Z][a-z])/)
          .each_with_object([""]) do |word, acc|
            if word =~ /^v[\d.]+/
              # "v1.2"-like token: the matched version is a chunk of its own.
              # Regexp.last_match is used instead of $MATCH so the result
              # does not depend on `require "English"` being loaded.
              acc << Regexp.last_match(0)
              series_end = true
            elsif word.match?(/^Version/)
              # "Version": open a "v" chunk; the number that follows is
              # appended to it.
              acc << "v"
              series_end = false
            elsif word.match?(/^\d|ebXML|ebMS/)
              if series_end
                acc << word
              else
                acc[-1] += word
              end
              series_end = true
            elsif word.match?(/^\w+$/) && word == word.upcase
              # Upper-case word: kept whole; it replaces an unfinished chunk.
              if series_end
                acc << word
              else
                acc[-1] = word
              end
              series_end = true
            elsif word.match?(/[A-Z]+[a-z]+/)
              # Capitalised word: only its first letter is used.
              if series_end
                acc << word[0]
              else
                acc[-1] += word[0]
              end
              series_end = false
            end
          end.join "-"
      end
    end

    #
    # Parse technology areas.
    #
    # @return [Array<String>] technology areas
    #
    def parse_technology_area
      super @node
    end
  end
end
@@ -0,0 +1,89 @@
1
module RelatonOasis
  #
  # Identifier and document-type helpers for parser classes. The methods read
  # the @text and @title instance variables of the including object, and
  # #parse_docid calls its #parse_docnumber.
  #
  module DataParserUtils
    #
    # Append the specification stage to a document number, then continue
    # with the part and errata suffixes.
    #
    # @param [String] num document number
    #
    # @return [String] document identifier with specification if needed
    #
    def parse_spec(num)
      with_stage =
        case @text
        when /OASIS Project Specification (\d+)/
          "#{num}-PS#{Regexp.last_match(1)}"
        when /Committee Specification (\d+)/
          "#{num}-CS#{Regexp.last_match(1)}"
        else
          num
        end
      parse_part(with_stage)
    end

    #
    # Append the part number found in the title, then continue with the
    # errata suffix. An identifier that already names a part ("PartN" or
    # "PtN", any case) is returned unchanged.
    #
    # @param [String] docid document identifier
    #
    # @return [String] document identifier with part if needed
    #
    def parse_part(docid)
      already_has_part = docid.match?(/(?:Part|Pt)\d+/i)
      return docid if already_has_part

      with_part =
        case @title
        when /Part\s(\d+)/
          "#{docid}-Pt#{Regexp.last_match(1)}"
        else
          docid
        end
      parse_errata(with_part)
    end

    #
    # Append the errata number found in the title. An identifier that
    # already names an errata ("errataN", any case) is returned unchanged.
    #
    # @param [String] id document identifier
    #
    # @return [String] document identifier with errata if needed
    #
    def parse_errata(id)
      already_has_errata = id.match?(/errata\d+/i)
      return id if already_has_errata

      # "Plus Errata" is tested first because the plain "Errata" pattern
      # matches it too.
      case @title
      when /Plus\sErrata\s(\d+)/
        "#{id}-plus-errata#{Regexp.last_match(1)}"
      when /Errata\s(\d+)/
        "#{id}-errata#{Regexp.last_match(1)}"
      else
        id
      end
    end

    #
    # Build the primary document identifier: the document number prefixed
    # with "OASIS ".
    #
    # @return [Array<RelatonBib::DocumentIdentifier>] document identifier
    #
    def parse_docid
      identifier = RelatonBib::DocumentIdentifier.new(
        type: "OASIS", id: "OASIS #{parse_docnumber}", primary: true,
      )
      [identifier]
    end

    #
    # Derive the document type from the citation text.
    #
    # @return [String] "specification", "memorandum", "resolution" or
    #   "standard" (the fallback)
    #
    def parse_doctype
      case @text
      when /OASIS Project Specification/, /Committee Specification/ then "specification"
      when /Technical Memorandum/ then "memorandum"
      when /Technical Resolution/ then "resolution"
      else "standard"
      end
    end

    #
    # Collect technology areas from the links of the technology areas list;
    # every whitespace character inside a name is replaced with "-".
    #
    # @param [#xpath] node document node
    #
    # @return [Array<String>] technology areas
    #
    def parse_technology_area(node)
      links = node.xpath("./summary/div/div/ul[@class='technology-areas__list']/li/a")
      links.map do |link|
        link.text.strip.gsub(/\s/, "-")
      end
    end
  end
end