relaton-w3c 1.10.1 → 1.11.2

Sign up to get free protection for your applications and to get access to all the features.
data/grammars/isodoc.rng CHANGED
@@ -152,9 +152,7 @@
152
152
  <data type="boolean"/>
153
153
  </attribute>
154
154
  </optional>
155
- <oneOrMore>
156
- <ref name="PureTextElement"/>
157
- </oneOrMore>
155
+ <ref name="XrefBody"/>
158
156
  </element>
159
157
  </define>
160
158
  <define name="erefType">
@@ -188,6 +186,42 @@
188
186
  <ref name="PureTextElement"/>
189
187
  </oneOrMore>
190
188
  </define>
189
+ <define name="localityStack">
190
+ <element name="localityStack">
191
+ <optional>
192
+ <attribute name="connective">
193
+ <choice>
194
+ <value>and</value>
195
+ <value>or</value>
196
+ <value>from</value>
197
+ <value>to</value>
198
+ <value/>
199
+ </choice>
200
+ </attribute>
201
+ </optional>
202
+ <zeroOrMore>
203
+ <ref name="locality"/>
204
+ </zeroOrMore>
205
+ </element>
206
+ </define>
207
+ <define name="sourceLocalityStack">
208
+ <element name="sourceLocalityStack">
209
+ <optional>
210
+ <attribute name="connective">
211
+ <choice>
212
+ <value>and</value>
213
+ <value>or</value>
214
+ <value>from</value>
215
+ <value>to</value>
216
+ <value/>
217
+ </choice>
218
+ </attribute>
219
+ </optional>
220
+ <zeroOrMore>
221
+ <ref name="sourceLocality"/>
222
+ </zeroOrMore>
223
+ </element>
224
+ </define>
191
225
  <define name="ul">
192
226
  <element name="ul">
193
227
  <attribute name="id">
@@ -1098,6 +1132,16 @@
1098
1132
  </define>
1099
1133
  </include>
1100
1134
  <!-- end overrides -->
1135
+ <define name="image" combine="choice">
1136
+ <element name="svg">
1137
+ <oneOrMore>
1138
+ <choice>
1139
+ <text/>
1140
+ <ref name="AnyElement"/>
1141
+ </choice>
1142
+ </oneOrMore>
1143
+ </element>
1144
+ </define>
1101
1145
  <define name="MultilingualRenderingType">
1102
1146
  <choice>
1103
1147
  <value>common</value>
@@ -2631,4 +2675,30 @@
2631
2675
  </zeroOrMore>
2632
2676
  </element>
2633
2677
  </define>
2678
+ <define name="XrefBody">
2679
+ <zeroOrMore>
2680
+ <ref name="XrefTarget"/>
2681
+ </zeroOrMore>
2682
+ <oneOrMore>
2683
+ <ref name="PureTextElement"/>
2684
+ </oneOrMore>
2685
+ </define>
2686
+ <define name="XrefTarget">
2687
+ <element name="location">
2688
+ <attribute name="target">
2689
+ <data type="string">
2690
+ <param name="pattern">\i\c*|\c+#\c+</param>
2691
+ </data>
2692
+ </attribute>
2693
+ <attribute name="connective">
2694
+ <choice>
2695
+ <value>and</value>
2696
+ <value>or</value>
2697
+ <value>from</value>
2698
+ <value>to</value>
2699
+ <value/>
2700
+ </choice>
2701
+ </attribute>
2702
+ </element>
2703
+ </define>
2634
2704
  </grammar>
@@ -13,5 +13,12 @@ module RelatonW3c
13
13
  def pubid_type(_)
14
14
  "W3C"
15
15
  end
16
+
17
#
# Build document identifiers, forcing a single primary W3C identifier.
#
# Identifiers produced by the parent implementation are kept unless they
# are flagged primary; a primary "W3C <name>" identifier, where <name> is
# the last path segment of the reference target, is placed first.
#
# @param [Hash] reference reference data; `reference[:target]` is read
# @param [Object] ver forwarded unchanged to the parent implementation
#
# @return [Array] identifiers with the primary W3C identifier first
#
def docids(reference, ver)
  identifiers = super
  identifiers.delete_if(&:primary)
  short_name = reference[:target].split("/").last
  primary_id = RelatonBib::DocumentIdentifier.new(
    id: "W3C #{short_name}", type: "W3C", primary: true,
  )
  identifiers.unshift(primary_id)
end
16
23
  end
17
24
  end
@@ -0,0 +1,188 @@
1
+ require "rdf"
2
+ require "linkeddata"
3
+ require "sparql"
4
+ require "mechanize"
5
+ require "relaton_w3c/data_parser"
6
+
7
+ module RelatonW3c
8
#
# Fetches W3C document metadata from the W3C RDF dump and from local BibXML
# files, writes one file per document into the output directory and builds
# an index of the written files.
#
class DataFetcher
  # Location of the W3C technical reports RDF dump loaded on initialization.
  RDF_SOURCE = "http://www.w3.org/2002/01/tr-automation/tr.rdf".freeze

  attr_reader :data, :group_names

  #
  # Data fetcher initializer
  #
  # @param [String] output directory to save files
  # @param [String] format format of output files (xml, yaml, bibxml)
  #
  def initialize(output, format)
    @output = output
    @format = format
    # "bibxml" documents are stored with the plain "xml" extension
    @ext = format.sub(/^bib/, "")
    @group_names = YAML.load_file(File.join(__dir__, "workgroups.yaml"))
    @data = RDF::Repository.load(RDF_SOURCE)
    # Paths written during this run; a Hash gives constant-time duplicate checks
    @files = {}
    @index = DataIndex.new
  end

  #
  # Initialize fetcher and run fetch
  #
  # @param [String] output directory to save files, default: "data"
  # @param [String] format format of output files (xml, yaml, bibxml), default: yaml
  #
  def self.fetch(output: "data", format: "yaml")
    started_at = Time.now
    # The monotonic clock is immune to wall-clock adjustments during the run
    clock_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    puts "Started at: #{started_at}"
    # mkdir_p is a no-op when the directory already exists
    FileUtils.mkdir_p output
    new(output, format).fetch
    elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - clock_start
    puts "Stopped at: #{Time.now}"
    puts "Done in: #{elapsed.round} sec."
  end

  #
  # Parse and save all documents, then sort and save the index.
  #
  # Three sources are processed in order: versioned documents from the RDF
  # data, unversioned documents from the RDF data, and the files found in
  # the "data" directory two levels above this file (parsed as BibXML).
  # A failure on one document is reported on stderr and does not stop the run.
  #
  def fetch
    query_versioned_docs.each do |sl|
      save_doc DataParser.parse(sl, self)
    rescue StandardError => e
      report_error sl.link, e
    end
    query_unversioned_docs.each do |sl|
      save_doc DataParser.parse(sl, self)
    rescue StandardError => e
      report_error sl.version_of, e
    end
    Dir[File.expand_path("../../data/*", __dir__)].each do |file|
      xml = File.read file, encoding: "UTF-8"
      # These files may legitimately repeat RDF documents, so duplicates are silent
      save_doc BibXMLParser.parse(xml), warn_duplicate: false
    rescue StandardError => e
      report_error file, e
    end
    @index.sort!.save
  end

  #
  # Query RDF source for documents having a title, a date and a `versionOf` link
  #
  # @return [RDF::Query::Solutions] query results
  #
  def query_versioned_docs
    sse = SPARQL.parse(%(
      PREFIX : <http://www.w3.org/2001/02pd/rec54#>
      PREFIX dc: <http://purl.org/dc/elements/1.1/>
      PREFIX doc: <http://www.w3.org/2000/10/swap/pim/doc#>
      # PREFIX mat: <http://www.w3.org/2002/05/matrix/vocab#>
      PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
      SELECT ?link ?title ?date ?version_of
      WHERE {
        ?link dc:title ?title ; dc:date ?date ; doc:versionOf ?version_of .
      }
    ))
    data.query sse
  end

  #
  # Query RDF source for the targets of `versionOf` links
  #
  # @return [Enumerable] query results, one solution per distinct `version_of`
  #
  def query_unversioned_docs
    sse = SPARQL.parse(%(
      PREFIX doc: <http://www.w3.org/2000/10/swap/pim/doc#>
      SELECT ?version_of
      WHERE { ?x doc:versionOf ?version_of . }
    ))
    data.query(sse).uniq(&:version_of)
  end

  #
  # Save document to file and register it in the index
  #
  # A document whose file name was already written during this run is
  # skipped; it is not serialized, not indexed and not written again.
  #
  # @param [RelatonW3c::W3cBibliographicItem, nil] bib bibliographic item
  # @param [Boolean] warn_duplicate print a warning when the file was already written
  #
  def save_doc(bib, warn_duplicate: true)
    return unless bib

    file = file_name(bib.docnumber)
    if @files.key?(file)
      warn "File #{file} already exists. Document: #{bib.docnumber}" if warn_duplicate
      return
    end

    # Serialize before touching the index so a failure leaves no dangling entry
    content = serialize(bib)
    @index.add bib.docnumber, file
    @files[file] = true
    File.write file, content, encoding: "UTF-8"
  end

  #
  # Generate file name
  #
  # @param [String] id document id
  #
  # @return [String] file name
  #
  def file_name(id)
    name = id.sub(/^W3C\s/, "").gsub(/[\s,:\/+]/, "_").squeeze("_").downcase
    File.join @output, "#{name}.#{@ext}"
  end

  private

  #
  # Serialize a bibliographic item in the configured output format
  #
  # @param [RelatonW3c::W3cBibliographicItem] bib bibliographic item
  #
  # @return [String] serialized document
  #
  def serialize(bib)
    case @format
    when "xml" then bib.to_xml(bibdata: true)
    when "yaml" then bib.to_hash.to_yaml
    # public_send keeps the dynamic dispatch limited to the item's public API
    else bib.public_send("to_#{@format}")
    end
  end

  #
  # Report a per-document failure on stderr
  #
  # @param [Object] subject what was being processed (link or file path)
  # @param [StandardError] error rescued error
  #
  def report_error(subject, error)
    warn "Error: document #{subject} #{error.message}"
    warn Array(error.backtrace).join("\n")
  end
end
188
+ end
@@ -0,0 +1,143 @@
1
+ module RelatonW3c
2
#
# Index of W3C documents. Each entry is a Hash with a :code and, when
# present, :file, :stage, :type, :date and :suff parsed from the document
# number.
#
class DataIndex
  # Splits a document number into an optional stage or type prefix, a code,
  # an optional 8- or 6-digit date and an optional "/suffix".
  DOCNUMBER_PATTERN = %r{
    ^(?:(?:(?<stage>WD|CRD|CR|PR|PER|REC|SPSD|OBSL|RET)|(?<type>D?NOTE))-)?
    (?<code>\w+(?:[+-][\w.]+)*?)
    (?:-(?<date>\d{8}|\d{6}))?
    (?:/(?<suff>\w+))?$
  }xi.freeze

  #
  # Initialize data index.
  #
  # @param [String] index_file path to index file
  # @param [Array<Hash>] index index data
  #
  def initialize(index_file: "index-w3c.yaml", index: [])
    @index_file = index_file
    @index = index
  end

  #
  # Create index from a GitHub repository
  #
  # @return [RelatonW3c::DataIndex] data index
  #
  def self.create_from_repo
    resp_index = Net::HTTP.get(URI("#{W3cBibliography::SOURCE}index-w3c.yaml"))

    # Newer versions of Psych use the `permitted_classes:` keyword; older
    # ones take the permitted classes as the second positional argument.
    index = if YAML.method(:safe_load).parameters.map(&:last).include?(:permitted_classes)
              YAML.safe_load(resp_index, permitted_classes: [Symbol])
            else
              YAML.safe_load(resp_index, [Symbol])
            end

    new index: index
  end

  #
  # Add document to index
  #
  # @param [String] docnumber document number
  # @param [String] file path to document file
  #
  def add(docnumber, file)
    @index << docnumber_to_parts(docnumber, file)
  end

  #
  # Save index to file.
  #
  def save
    File.write @index_file, @index.to_yaml, encoding: "UTF-8"
  end

  #
  # Sort index in place
  #
  # @return [RelatonW3c::DataIndex] self, to allow chaining
  #
  def sort!
    @index.sort! { |a, b| compare_index_items a, b }
    self
  end

  #
  # Search filename in index
  #
  # The first entry is returned whose code starts with the reference's code
  # (case-insensitive) and which agrees with every part (stage, type, date,
  # suffix) that the reference specifies.
  #
  # @param [String] ref reference
  #
  # @return [String, nil] document's filename, nil when nothing matches or
  #   the reference cannot be parsed
  #
  def search(ref)
    wanted = docnumber_to_parts(ref)
    # An unparseable reference has no code and therefore cannot match
    return unless wanted[:code]

    # Built once here instead of once per index entry
    code_prefix = /^#{Regexp.escape wanted[:code]}/i
    found = @index.find do |entry|
      entry[:code].to_s.match?(code_prefix) &&
        same_text?(wanted[:stage], entry[:stage]) &&
        same_text?(wanted[:type], entry[:type]) &&
        (wanted[:date].nil? || wanted[:date] == entry[:date].to_s) &&
        same_text?(wanted[:suff], entry[:suff])
    end
    found&.fetch(:file)
  end

  #
  # Compare index items: by code ascending, then by stage weight descending,
  # then by date weight descending.
  #
  # @param [Hash] aid first item
  # @param [Hash] bid second item
  #
  # @return [Integer] comparison result
  #
  def compare_index_items(aid, bid)
    # to_s keeps entries without a code comparable (they sort first)
    ret = aid[:code].to_s <=> bid[:code].to_s
    ret = stage_weight(bid[:stage]) <=> stage_weight(aid[:stage]) if ret.zero?
    ret = date_weight(bid[:date]) <=> date_weight(aid[:date]) if ret.zero?
    ret
  end

  #
  # Weight of stage
  #
  # The weight is the position of the stage among the keys of
  # DataParser::STAGES. A missing stage, or one that is not among the keys,
  # weighs the number of known stages so that the result is always an
  # Integer and items stay comparable.
  #
  # @param [String, nil] stage stage
  #
  # @return [Integer] weight
  #
  def stage_weight(stage)
    stages = DataParser::STAGES.keys
    return stages.size if stage.nil?

    # Stages are parsed case-insensitively, so retry upper-cased
    # (presumably the keys are upper-case strings; verify against DataParser)
    stages.index(stage) || stages.index(stage.to_s.upcase) || stages.size
  end

  #
  # Weight of date
  #
  # @param [String, nil] date date
  #
  # @return [String] weight, "99999999" when the date is missing
  #
  def date_weight(date)
    return "99999999" if date.nil?

    date.to_s
  end

  #
  # Parse document number to parts
  #
  # @param [String] docnumber document number
  # @param [String, nil] file path to document file
  #
  # @return [Hash{Symbol=>String}] document parts; :code is always present
  #   (nil when the number does not match the pattern), the other keys only
  #   when they have a value
  #
  def docnumber_to_parts(docnumber, file = nil)
    match = DOCNUMBER_PATTERN.match(docnumber)
    # Insertion order (code, file, stage, type, date, suff) shapes the saved YAML
    entry = { code: match && match[:code] }
    entry[:file] = file if file
    %i[stage type date suff].each do |part|
      value = match && match[part]
      entry[part] = value if value
    end
    entry
  end

  private

  #
  # Check an optional reference part against an index entry part
  #
  # @param [String, nil] wanted part requested by the reference
  # @param [Object, nil] actual part stored in the index entry
  #
  # @return [Boolean] true when the part is not requested or equals the
  #   stored part ignoring case
  #
  def same_text?(wanted, actual)
    return true if wanted.nil?
    return false if actual.nil?

    wanted.casecmp?(actual.to_s) ? true : false
  end
end
143
+ end