djnml 1.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.rspec +1 -0
- data/Gemfile +19 -0
- data/LICENSE.txt +24 -0
- data/README.rdoc +35 -0
- data/Rakefile +49 -0
- data/VERSION +1 -0
- data/djnml.gemspec +79 -0
- data/lib/djnml/codes.rb +3127 -0
- data/lib/djnml/delete.rb +13 -0
- data/lib/djnml/modification.rb +244 -0
- data/lib/djnml.rb +417 -0
- data/spec/data/20120716155056208LL000587.NML +28 -0
- data/spec/data/20120716161436878LL001634.NML +117 -0
- data/spec/data/20120716162053366LL005062.NML +115 -0
- data/spec/data/20120720222918942LL007284.NML +26 -0
- data/spec/data/DN20080506000741.nml +72 -0
- data/spec/data/DN20080506000785.nml +212 -0
- data/spec/data/DN20080506000839.nml +57 -0
- data/spec/djnml_codes_spec.rb +47 -0
- data/spec/djnml_spec.rb +477 -0
- data/spec/spec_helper.rb +12 -0
- metadata +157 -0
data/lib/djnml/delete.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
class DJNML
  # One <doc-delete> instruction: identifies a previously delivered story by
  # product, document date, sequence number and publisher, and carries the
  # stated reason for removing it.
  class Delete
    attr_reader :product, :doc_date, :seq, :publisher, :reason

    # Builds the instruction from a Hash of attributes.
    #
    # args - Hash with optional Symbol keys :product, :doc_date, :seq,
    #        :publisher and :reason. :doc_date is converted with Time.parse
    #        and :seq with #to_i; the other values are stored as given.
    #        Keys whose value is nil or false are skipped, so the matching
    #        reader returns nil.
    def initialize(args = {})
      product   = args[:product]
      doc_date  = args[:doc_date]
      seq       = args[:seq]
      publisher = args[:publisher]
      reason    = args[:reason]

      @product   = product if product
      @doc_date  = Time.parse(doc_date) if doc_date
      @seq       = seq.to_i if seq
      @publisher = publisher if publisher
      @reason    = reason if reason
    end
  end
end
|
@@ -0,0 +1,244 @@
|
|
1
|
+
# Copyright (c) 2012, Tobias Begalke
|
2
|
+
# All rights reserved.
|
3
|
+
#
|
4
|
+
# Redistribution and use in source and binary forms, with or without
|
5
|
+
# modification, are permitted provided that the following conditions are met:
|
6
|
+
# * Redistributions of source code must retain the above copyright
|
7
|
+
# notice, this list of conditions and the following disclaimer.
|
8
|
+
# * Redistributions in binary form must reproduce the above copyright
|
9
|
+
# notice, this list of conditions and the following disclaimer in the
|
10
|
+
# documentation and/or other materials provided with the distribution.
|
11
|
+
# * Neither the name of the <organization> nor the
|
12
|
+
# names of its contributors may be used to endorse or promote products
|
13
|
+
# derived from this software without specific prior written permission.
|
14
|
+
|
15
|
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
16
|
+
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
17
|
+
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
18
|
+
# DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
|
19
|
+
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
20
|
+
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
21
|
+
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
22
|
+
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
23
|
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
24
|
+
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
25
|
+
|
26
|
+
|
27
|
+
class DJNML
  # A single <modify-replace> instruction: the identity of the story being
  # modified (publisher, doc_date, product, seq) plus whichever replacement
  # parts were supplied (metadata, headline, text, summary, press cutout,
  # urgency).
  class Modification
    attr_reader :publisher, :doc_date, :product, :seq, :xpath, :mdata,
                :headline, :text, :urgency, :press_cutout, :summary

    # Readers inspected by #fields_to_modify, in reporting order.
    MODIFIABLE_FIELDS = [:mdata, :headline, :text, :urgency,
                         :press_cutout, :summary].freeze

    # Builds a modification either from a parsed XML element or from a plain
    # Hash.
    #
    # args - Hash. Symbol keys :publisher, :doc_date (String, run through
    #        Time.parse), :product and :seq (converted with #to_i) are always
    #        honoured. When :xml is a Nokogiri::XML::Element the replacement
    #        parts are read from it; otherwise the same attributes, plus the
    #        replacement parts, are read from String keys ('publisher',
    #        'doc_date', 'product', 'seq', 'mdata', 'headline', 'text',
    #        'summary', 'press_cutout', 'urgency').
    #
    # Raises whatever Time.parse raises for an unparsable date.
    def initialize(args = {})
      @publisher = args[:publisher] if args[:publisher]
      @doc_date  = Time.parse(args[:doc_date]) if args[:doc_date]
      @product   = args[:product] if args[:product]
      @seq       = args[:seq].to_i if args[:seq]

      xml = args[:xml]
      if xml.is_a?(Nokogiri::XML::Element)
        assign_from_xml(xml)
      else
        assign_from_hash(args)
      end
    end

    # Returns an Array of Symbols naming the replacement parts that are set
    # on this modification, in MODIFIABLE_FIELDS order.
    def fields_to_modify
      MODIFIABLE_FIELDS.select { |field| send(field) }
    end

    private

    # Reads the replacement parts from a <modify-replace> element. Each part
    # is taken from the first matching descendant and skipped when absent.
    def assign_from_xml(xml)
      @xpath = xml['xpath']

      mdata = xml.search('djn-mdata').first
      @mdata = Mdata.new(mdata) if mdata

      headline = xml.search('headline').first
      @headline = headline.text.strip if headline

      text = xml.search('text').first
      @text = XMLText.new(text) if text

      summary = xml.search('summary').first
      @summary = XMLText.new(summary) if summary

      press = xml.search('djn-press-cutout').first
      @press_cutout = press.text.strip if press

      urgency = xml.search('djn-urgency').first
      @urgency = urgency.text.strip if urgency
    end

    # Reads identity and replacement parts from String keys of +args+.
    # Values given under String keys override those already taken from the
    # Symbol keys.
    def assign_from_hash(args)
      @publisher    = args['publisher'] if args['publisher']
      @doc_date     = Time.parse(args['doc_date']) if args['doc_date']
      @product      = args['product'] if args['product']
      @seq          = args['seq'].to_i if args['seq']
      @mdata        = Mdata.new(args['mdata']) if args['mdata']
      @headline     = args['headline'] if args['headline']
      @text         = XMLText.new(args['text']) if args['text']
      @summary      = XMLText.new(args['summary']) if args['summary']
      @press_cutout = args['press_cutout'] if args['press_cutout']
      @urgency      = args['urgency'] if args['urgency']
    end

    # A block of replacement text, kept both as stripped plain text and as
    # the serialized markup of the element's children.
    class XMLText
      attr_reader :text, :html

      # data - a Nokogiri::XML::Element (text and markup are taken from its
      #        children) or a Hash with 'text' and 'html' keys. Any other
      #        value leaves both readers at nil.
      def initialize(data)
        case data
        when Hash
          @text = data['text']
          @html = data['html']
        when Nokogiri::XML::Element
          children = data.children
          @text = children.text.strip
          @html = children.to_xml
        end
      end

      # Returns the plain text, or an empty String when none was set.
      def to_s
        @text.to_s
      end
    end

    # The coding metadata of a modification: one list of codes per category.
    class Mdata
      attr_reader :company_code, :isin_code, :industry_code, :government_code,
                  :page_code, :subject_code, :market_code, :product_code,
                  :geo_code, :stat_code, :journal_code, :routing_code,
                  :content_code, :function_code

      # Categories whose codes are kept as plain stripped Strings.
      PLAIN_CATEGORIES = %w[company isin page].freeze

      # Categories whose codes are wrapped in ::DJNML::Codes.
      SYMBOL_CATEGORIES = %w[industry government subject market product geo
                             stat journal routing content function].freeze

      # Returns a new Mdata built from +data+, or nil unless +data+ is a Hash.
      def self.from_hash(data)
        new(data) if data.is_a?(Hash)
      end

      # data - a Nokogiri::XML::Element (a <djn-mdata> node), a Hash, or nil.
      #        Anything else leaves every reader at nil.
      def initialize(data = nil)
        case data
        when Hash
          initialize_from_hash(data)
        when Nokogiri::XML::Element
          initialize_from_xml(data)
        end
      end

      # Populates the readers from a Hash keyed by '<category>_code'.
      #
      # Plain categories are stored as given. For the remaining categories
      # each entry is expected to respond to ['symbol'] and is turned into a
      # ::DJNML::Codes. A missing list is treated as empty rather than
      # raising on nil.
      def initialize_from_hash(data)
        PLAIN_CATEGORIES.each do |category|
          key = "#{category}_code"
          instance_variable_set("@#{key}", data[key])
        end

        SYMBOL_CATEGORIES.each do |category|
          key = "#{category}_code"
          codes = (data[key] || []).map { |entry| ::DJNML::Codes.new(entry['symbol']) }
          instance_variable_set("@#{key}", codes)
        end
      end

      # Populates the readers from the djn-coding/djn-<category>/c elements
      # found below +xml+. Every reader ends up holding an Array, empty when
      # the category has no codes.
      def initialize_from_xml(xml)
        PLAIN_CATEGORIES.each do |category|
          instance_variable_set("@#{category}_code", code_texts(xml, category))
        end

        SYMBOL_CATEGORIES.each do |category|
          codes = code_texts(xml, category).map { |symbol| ::DJNML::Codes.new(symbol) }
          instance_variable_set("@#{category}_code", codes)
        end
      end

      private

      # Returns the stripped text of every <c> element of one coding
      # category, one String per element.
      def code_texts(xml, category)
        xml.search("djn-coding/djn-#{category}/c").map { |node| node.text.strip }
      end
    end
  end
end
|
data/lib/djnml.rb
ADDED
@@ -0,0 +1,417 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Copyright (c) 2012, Tobias Begalke
|
4
|
+
# All rights reserved.
|
5
|
+
#
|
6
|
+
# Redistribution and use in source and binary forms, with or without
|
7
|
+
# modification, are permitted provided that the following conditions are met:
|
8
|
+
# * Redistributions of source code must retain the above copyright
|
9
|
+
# notice, this list of conditions and the following disclaimer.
|
10
|
+
# * Redistributions in binary form must reproduce the above copyright
|
11
|
+
# notice, this list of conditions and the following disclaimer in the
|
12
|
+
# documentation and/or other materials provided with the distribution.
|
13
|
+
# * Neither the name of the <organization> nor the
|
14
|
+
# names of its contributors may be used to endorse or promote products
|
15
|
+
# derived from this software without specific prior written permission.
|
16
|
+
|
17
|
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
18
|
+
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
19
|
+
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
20
|
+
# DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
|
21
|
+
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
22
|
+
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
23
|
+
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
24
|
+
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
25
|
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
26
|
+
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
27
|
+
|
28
|
+
|
29
|
+
require 'date'
require 'time'
require 'nokogiri'
require 'language_detector'
require 'djnml/codes'
require 'djnml/delete'
require 'djnml/modification'
|
35
|
+
|
36
|
+
# Reader for Dow Jones NML (DJNML) news documents.
#
# An instance is populated by #load, which parses one file and exposes the
# envelope attributes, coding metadata, headline, body text, copyright data,
# contact details scraped from the text, and any delete / modify
# instructions through the readers below. Readers for parts that are absent
# from the document return nil.
class DJNML

  attr_reader :msize, :md5, :sys_id, :destination, :dist_id, :transmission_date,
              :publisher, :doc_date, :product, :seq, :lang,
              :news_source, :origin, :service_id,
              :urgency,
              :brand, :temp_perm, :retention, :hot, :original_source,
              :accession_number, :display_date, :page_citation,
              :company_code, :isin_code, :industry_code, :page_code,
              :government_code, :stat_code, :journal_code, :routing_code,
              :content_code, :function_code, :subject_code, :market_code,
              :product_code, :geo_code,
              :headline, :headline_brand, :text, :html, :language,
              :copyright_year, :copyright_holder,
              :website, :company_name, :company_address, :company_zip, :company_city,
              :delete, :modifications

  # XPath of the <djn-newswires> element and of the coding section below it.
  NEWSWIRES_PATH = '/doc/djnml/head/docdata/djn/djn-newswires'.freeze
  CODING_PATH    = "#{NEWSWIRES_PATH}/djn-mdata/djn-coding".freeze

  # Coding categories kept as plain stripped Strings.
  PLAIN_CODE_CATEGORIES = %w[company isin page].freeze

  # Coding categories wrapped in Codes objects.
  SYMBOL_CODE_CATEGORIES = %w[industry government subject market product geo
                              stat journal routing content function].freeze

  # Patterns used to scrape contact details out of the body text.
  WEBSITE_PATTERN = /Internet:\s+(.+?)$/
  COMPANY_PATTERN = /Company:\s+(\S.+?)\s*\n+\s+(\b.+?)\n+\s+(\d+)\s+(\b.+?)\n+/

  # Parses +filename+ into a new instance.
  #
  # filename - path of the document; nil or false yields nil.
  #
  # Returns the populated instance, or nil when no filename was given.
  # Raises FileError when the file does not exist.
  def self.load(filename)
    return unless filename

    new.load(filename)
  end

  # Parses +filename+ and populates this instance from it.
  #
  # Missing elements and attributes leave the matching readers at nil; an
  # unparsable date leaves only that date at nil. Errors raised while
  # building Modification objects are not rescued.
  #
  # Returns self.
  # Raises FileError when the file does not exist.
  def load(filename)
    raise FileError, "#{filename}: no such file!" unless File.exist?(filename)

    # Block form closes the handle once parsing is done; File.open (unlike
    # Kernel#open) never interprets the name as a command.
    parser = File.open(filename) { |io| Nokogiri::XML(io) }

    load_envelope(parser)
    load_newswires(parser)
    load_codes(parser)
    load_body(parser)
    load_copyright(parser)
    extract_contact_details
    detect_language
    load_deletes(parser)
    load_modifications(parser)

    self
  end

  # Returns true when a body text was found in the loaded document.
  def has_content?
    !text.nil?
  end

  # Raised when the file handed to DJNML.load / DJNML#load does not exist.
  class FileError < StandardError
  end

  private

  # Reads the attributes of the outer <doc> and <djnml> elements.
  def load_envelope(parser)
    if (doc = parser.search('/doc').first)
      @msize             = doc['msize'].to_i
      @md5               = doc['md5']
      @sys_id            = doc['sysId']
      @destination       = doc['destination']
      @dist_id           = doc['distId']
      @transmission_date = parse_time(doc['transmission-date'])
    end

    if (djnml = parser.search('/doc/djnml').first)
      @publisher = djnml['publisher']
      @doc_date  = parse_time(djnml['docdate'])
      @product   = djnml['product']
      @seq       = djnml['seq'].to_i
      @lang      = djnml['lang']
    end
  end

  # Reads <djn-newswires>, its urgency and the attributes of <djn-mdata>.
  def load_newswires(parser)
    if (newswires = parser.search(NEWSWIRES_PATH).first)
      @news_source = newswires['news-source']
      @origin      = newswires['origin']
      @service_id  = newswires['service-id']
    end

    if (urgency = parser.search("#{NEWSWIRES_PATH}/djn-urgency").first)
      # No String#squeeze here: without arguments it collapses any repeated
      # character, digits included ("100" would become "10").
      @urgency = urgency.text.strip.to_i
    end

    if (mdata = parser.search("#{NEWSWIRES_PATH}/djn-mdata").first)
      @brand            = mdata['brand']
      @temp_perm        = mdata['temp-perm']
      @retention        = mdata['retention']
      @hot              = mdata['hot']
      @original_source  = mdata['original-source']
      @accession_number = mdata['accession-number']
      @page_citation    = mdata['page-citation']
      @display_date     = parse_time(mdata['display-date'])
    end
  end

  # Fills one @<category>_code reader per coding category.
  def load_codes(parser)
    PLAIN_CODE_CATEGORIES.each do |category|
      instance_variable_set("@#{category}_code", code_texts(parser, category))
    end

    SYMBOL_CODE_CATEGORIES.each do |category|
      instance_variable_set("@#{category}_code", coded_values(parser, category))
    end
  end

  # Returns the stripped text of every <c> element of one coding category.
  def code_texts(parser, category)
    parser.search("#{CODING_PATH}/djn-#{category}/c").map { |node| node.text.strip }
  end

  # Returns the codes of one category wrapped in Codes objects, or nil when
  # a code cannot be wrapped (best-effort: the category is then left unset
  # instead of aborting the whole load).
  def coded_values(parser, category)
    code_texts(parser, category).map { |symbol| Codes.new(symbol) }
  rescue StandardError
    nil
  end

  # Reads the headline (with its optional brand-display attribute) and the
  # body text, the latter both as markup and as stripped plain text.
  def load_body(parser)
    if (headline = parser.search('/doc/djnml/body/headline').first)
      @headline       = headline.text.strip
      @headline_brand = headline['brand-display']
    end

    if (body = parser.search('/doc/djnml/body/text').first)
      children = body.children
      @html = children.to_xml
      @text = children.text.strip
    end
  end

  # Reads the year and holder attributes of the <copyright> element.
  def load_copyright(parser)
    copyright = parser.search('/doc/djnml/head/copyright').first
    return unless copyright

    @copyright_year   = copyright['year'].to_s.strip.to_i
    @copyright_holder = copyright['holder']
  end

  # Scrapes the "Internet:" and "Company:" blocks out of the body text.
  def extract_contact_details
    return unless @text

    if (site = WEBSITE_PATTERN.match(@text))
      @website = site[1].strip
    end

    if (company = COMPANY_PATTERN.match(@text))
      @company_name    = company[1].strip
      @company_address = company[2].strip
      @company_zip     = company[3].strip
      @company_city    = company[4].strip
    end
  end

  # Asks LanguageDetector for the language of the body text. Detection is
  # best-effort: any failure leaves #language at nil.
  def detect_language
    @language = LanguageDetector.instance.detect(@text)
  rescue StandardError
    nil
  end

  # Collects one Delete per <doc-delete> element. Parsing stops at the first
  # instruction that cannot be built; the ones collected so far are kept.
  def load_deletes(parser)
    @delete = []
    parser.search('/doc/djnml/administration/doc-delete').each do |dd|
      @delete << Delete.new(:product   => dd['product'],
                            :doc_date  => dd['docdate'],
                            :seq       => dd['seq'],
                            :publisher => dd['publisher'],
                            :reason    => dd['reason'])
    end
  rescue StandardError
    nil
  end

  # Collects one Modification per <modify-replace> element. The identifying
  # attributes come from the <doc-modify> element that encloses each one,
  # so several <doc-modify> blocks are attributed correctly.
  def load_modifications(parser)
    @modifications = []
    parser.search('/doc/djnml/administration/doc-modify/modify-replace').each do |replace|
      owner = replace.parent
      @modifications << Modification.new(:doc_date  => owner['docdate'],
                                         :product   => owner['product'],
                                         :publisher => owner['publisher'],
                                         :seq       => owner['seq'],
                                         :xml       => replace)
    end
  end

  # Returns +value+ parsed as a Time, or nil when it is nil or unparsable.
  def parse_time(value)
    Time.parse(value) if value
  rescue ArgumentError
    nil
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
<?xml version="1.0" encoding="ISO-8859-1"?>
|
2
|
+
<!DOCTYPE doc SYSTEM "djnml-1.0b.dtd">
|
3
|
+
<doc msize="000001968" md5="88d754f61ba4361c72a6a6dd0d2d00d5" sysId="sbknwsdcmn4p1" destination="AW" distId="NHP1" transmission-date="20120716T135050Z" >
|
4
|
+
<djnml publisher="DJN" docdate="20120713" product="LL" seq="587" xml:lang="en-us" >
|
5
|
+
<administration>
|
6
|
+
<doc-delete product="LL" docdate="20110608" seq="001579" publisher="DJN" reason="expire" />
|
7
|
+
<doc-delete product="LL" docdate="20110608" seq="001580" publisher="DJN" reason="expire" />
|
8
|
+
<doc-delete product="LL" docdate="20110608" seq="001581" publisher="DJN" reason="expire" />
|
9
|
+
<doc-delete product="LL" docdate="20110608" seq="001582" publisher="DJN" reason="expire" />
|
10
|
+
<doc-delete product="LL" docdate="20110608" seq="001583" publisher="DJN" reason="expire" />
|
11
|
+
<doc-delete product="LL" docdate="20110608" seq="001584" publisher="DJN" reason="expire" />
|
12
|
+
<doc-delete product="LL" docdate="20110608" seq="001585" publisher="DJN" reason="expire" />
|
13
|
+
<doc-delete product="LL" docdate="20110608" seq="001586" publisher="DJN" reason="expire" />
|
14
|
+
<doc-delete product="LL" docdate="20110608" seq="001587" publisher="DJN" reason="expire" />
|
15
|
+
<doc-delete product="LL" docdate="20110608" seq="001588" publisher="DJN" reason="expire" />
|
16
|
+
<doc-delete product="LL" docdate="20110608" seq="001589" publisher="DJN" reason="expire" />
|
17
|
+
<doc-delete product="LL" docdate="20110608" seq="001590" publisher="DJN" reason="expire" />
|
18
|
+
<doc-delete product="LL" docdate="20110608" seq="001591" publisher="DJN" reason="expire" />
|
19
|
+
<doc-delete product="LL" docdate="20110608" seq="001592" publisher="DJN" reason="expire" />
|
20
|
+
<doc-delete product="LL" docdate="20110608" seq="001593" publisher="DJN" reason="expire" />
|
21
|
+
<doc-delete product="LL" docdate="20110608" seq="001594" publisher="DJN" reason="expire" />
|
22
|
+
<doc-delete product="LL" docdate="20110608" seq="001595" publisher="DJN" reason="expire" />
|
23
|
+
<doc-delete product="LL" docdate="20110608" seq="001596" publisher="DJN" reason="expire" />
|
24
|
+
<doc-delete product="LL" docdate="20110608" seq="001597" publisher="DJN" reason="expire" />
|
25
|
+
<doc-delete product="LL" docdate="20110608" seq="001598" publisher="DJN" reason="expire" />
|
26
|
+
</administration>
|
27
|
+
</djnml>
|
28
|
+
</doc>
|