vortex_client 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,38 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'rubygems'
3
+ require 'open-uri'
4
+ require 'vortex_client'
5
+ require 'uri'
6
+ require 'nokogiri'
7
+ require 'htmlentities'
8
+ require 'json'
9
+ require 'iconv'
10
+
11
# Scrapes HERO publication listing pages and republishes them through the
# Vortex WebDAV API.
class MigrateHeroPublications
  attr_reader :vortex, :uri

  # url - base WebDAV url of the destination server. Opens the WebDAV
  # connection (credentials from the OS X keychain) and keeps the parsed uri.
  def initialize(url)
    @vortex = Vortex::Connection.new(url, :use_osx_keychain => true)
    @uri = URI.parse(url)
  end

  # Fetch the page at +url+ and print every table cell whose text matches
  # the (very permissive) "<digits>:<digits>" pattern, separated by rulers.
  def migrate_publications(url)
    doc = Nokogiri::HTML.parse(open(url))
    doc.encoding = 'utf-8'
    doc.xpath("//td").each do |cell|
      text = cell.inner_text
      next unless text =~ /\d*:\d*/
      puts text
      puts "-------"
    end
  end
end
31
+
32
# Scrape all webpages found in src_url and store in dest_url
dest_url = 'https://nyweb1-dav.uio.no/konv/hero/publikasjoner'
# src_url = 'http://www.hero.uio.no/publicat/2003/'
# src_url = 'http://www.hero.uio.no/nyheter.html'
src_url = 'http://www.hero.uio.no/publications_all/publications10.html'

migration = MigrateHeroPublications.new(dest_url)
migration.migrate_publications(src_url)
@@ -0,0 +1,351 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'rubygems'
3
+ require 'vortex_client'
4
+ require 'nokogiri'
5
+ require 'open-uri'
6
+ require 'uri'
7
+ require 'pry'
8
+ require "net/http"
9
+ require 'pathname'
10
+ require 'json'
11
+ require 'pp'
12
+
13
+ # Scrape content from the Norwegian Center for Studies of Holocaust and Religious Minorities'
14
+ # website http://www.hlsenteret.no/ and re-publish content to University of Oslo's CMS
15
+ # using the WebDAV API.
16
+
17
+ # TODO
18
+ # ok - Hente ut tittel på mappen
19
+ # ok - Hente ut content/type på bilder
20
+ # ok - Kopiere over ingress bilde
21
+ # ok - Luke ut rare tegn i overskriftene
22
+ # ok - Kjøre alle dokumenter uten kræsj
23
+ # ok - Kopiere over flere bilder
24
+ # ok - Få logging til fil til å fungere
25
+ # - Logg publisering til fil
26
+ # - Hvorfor konverteres og publiseres /konv/kunnskapsbasen/-a-hrefhttp-.html
27
+ # - Håndtere /konv/kunnskapsbasen/hl-senterets-kunnskapsbase.html spesielt?
28
+
29
# Shared WebDAV connection used by all helpers below (OS X keychain auth).
@vortex = Vortex::Connection.new("https://nyweb4-dav.uio.no", :use_osx_keychain => true)
30
+
31
# Simple logger: echo +str+ to stdout and append a timestamped
# "ISO8601;message" line to scrape_holocaust.log.
def log(str)
  # BUG FIX: Time#iso8601 lives in the stdlib 'time' extension, which this
  # file never required — every call raised NoMethodError. require is
  # idempotent, so loading it here is safe.
  require 'time'
  puts str
  File.open("scrape_holocaust.log", 'a') do |f|
    f.write(Time.now.iso8601 + ";" + str + "\n")
  end
end
38
+
39
# Issue a GET request for +url+ and return the value of the
# Content-Type response header.
def http_content_type(url)
  uri = URI.parse(url)
  request = Net::HTTP::Get.new(uri.request_uri)
  request["User-Agent"] = "My Ruby Script"
  request["Accept"] = "*/*"
  response = Net::HTTP.new(uri.host, uri.port).request(request)
  response['content-type']
end
50
+
51
# Scrape the title of a source page: prefer the ".folder .title" element,
# fall back to ".article .title". Returns nil when the page cannot be
# fetched or has no recognizable title.
def scrape_folder_title(url)
  title = nil
  begin
    # BUG FIX: the original opened the undefined local `scrape_url`; the
    # resulting NameError was swallowed by the outer rescue, so the method
    # unconditionally returned nil and no folder ever got a title.
    doc = Nokogiri::HTML.parse(open(url))
    doc.encoding = 'utf-8'
    begin
      title = doc.css(".folder .title").first.inner_html
    rescue
      # .first is nil when the selector matches nothing — fall through.
    end
    if title == nil
      title = doc.css(".article .title").first.inner_html
    end
  rescue
    # Network errors or a missing ".article .title" both end here: nil.
  end
  return title
end
68
+
69
# Ensure every folder in +dest_path+ exists in Vortex, creating missing
# collections as article listings. For each newly created folder the
# matching page on the source site (@url host, "/konv" prefix stripped)
# is scraped for a title, which is capitalized and set on the collection.
def create_path(dest_path)
  path_so_far = "/"
  dest_path.split("/").each do |folder|
    next if folder == ""
    path_so_far += folder.downcase + "/"
    next if @vortex.exists?(path_so_far)

    puts "Creating folder " + path_so_far
    uri = URI.parse(@url)
    title = scrape_folder_title('http://' + uri.host + path_so_far.gsub('/konv', ''))
    if title
      # Capitalize only the first letter, preserving the rest verbatim.
      title = title[0..0].upcase + title[1..title.length]
      puts "Mappetittel: " + title.to_s
    end

    @vortex.mkdir(path_so_far)
    @vortex.proppatch(path_so_far, '<v:collection-type xmlns:v="vrtx">article-listing</v:collection-type>')
    if title
      @vortex.proppatch(path_so_far, '<v:userTitle xmlns:v="vrtx">' + title.to_s + '</v:userTitle>')
    end
  end
end
98
+
99
+
100
# Return an array of {:url, :caption} hashes, pairing each image in the
# page's image series with the caption text at the same position.
# NOTE(review): assumes .imageText elements line up 1:1 with
# .imageSeriesImage elements, as the original did.
def scrape_images(doc)
  doc.css(".imageSeriesImage").each_with_index.map do |item, index|
    { :url => item.attr("src"), :caption => doc.css(".imageText")[index].text }
  end
end
112
+
113
+
114
# Resize an image using the unix command line utility 'sips' available on osx.
#
# content      - raw image bytes
# content_type - file extension ("jpg", "png", ...)
# size         - target width in pixels
#
# Returns the raw bytes of the resized image.
def resize_image(content, content_type, size)
  filename = "/tmp/" + (1 + rand(10000000)).to_s + "." + content_type
  filename_resized = "/tmp/" + (1 + rand(10000000)).to_s + "_resized." + content_type
  # BUG FIX: write in binary mode so image bytes survive the round trip.
  File.open(filename, 'wb') { |f| f.write(content) }
  # BUG FIX: the input filename was missing from the sips invocation
  # (the original shipped with a garbled placeholder in its place).
  %x[sips --resampleWidth #{size} #{filename} --out #{filename_resized}]
  # BUG FIX: IO.readlines(path, 'r') treated "r" as the line separator and
  # returned an Array (whose to_s is an inspect string on modern Ruby);
  # read the result back as one binary string instead.
  File.open(filename_resized, 'rb') { |f| f.read }
end
125
+
126
+
127
# Download the graphic at +src_url+, store both the original and a
# 300px-wide copy under +dest_path+ in Vortex, and return their urls as
# { :vortex_url => ..., :vortex_url_resized => ... }. Exits the process
# with diagnostics if either upload fails.
def download_image(src_url, dest_path)
  src_url = src_url.gsub(/\?.*/, '') # strip any query string
  content_type = http_content_type(src_url).gsub("image/", "").gsub("jpeg", "jpg")
  content = open(src_url).read
  basename = Pathname.new(src_url).basename.to_s.gsub(/\..*/, '')

  vortex_url = (dest_path + basename + "." + content_type).downcase
  begin
    @vortex.put_string(vortex_url, content)
  rescue Exception => e
    puts e.message
    pp e.backtrace.inspect
    puts "vortex_url: " + vortex_url
    exit
  end

  # Store a resized image to vortex
  puts "Nedskalerer bilde: " + src_url
  content_resized = resize_image(content, content_type, 300)
  vortex_url_resized = (dest_path + basename + "_width_300." + content_type).downcase
  begin
    @vortex.put_string(vortex_url_resized, content_resized)
  rescue Exception => e
    puts e.message
    pp e.backtrace.inspect
    puts "vortex_url_resized: " + vortex_url_resized
    exit
  end

  { :vortex_url_resized => vortex_url_resized, :vortex_url => vortex_url }
end
162
+
163
+
164
# Publish a scraped article to Vortex.
#
# path         - path of the article on the source site (trailing digits,
#                i.e. the numeric article id, are stripped)
# title        - article title
# introduction - lead-in html
# body         - article body html
# doc          - parsed Nokogiri document, used to harvest the image series
#
# Returns the url the article was published to.
def publish_article(path, title, introduction, body, doc)
  dest_path = "/konv" + path.gsub(/\d*$/, '')
  dest_path += "/" unless dest_path =~ /\/$/

  puts "Publishing to: '#{dest_path}'"
  create_path(dest_path)
  @vortex.cd(dest_path)

  # images = []
  # Comment out this line to prevent scraper from downloading images
  images = scrape_images(doc)
  images.each do |image|
    filenames = download_image(image[:url], dest_path)
    image[:vortex_url] = filenames[:vortex_url_resized]
    image[:vortex_url_org] = filenames[:vortex_url]
  end

  url = (dest_path + Vortex::StringUtils.create_filename(title.gsub(":", "_")).gsub(/-$/, '') + '.html').downcase

  attributes = {
    :title => title,
    :introduction => introduction,
    :body => body,
    :publishedDate => Time.now,
    :url => url
  }
  # First image in the series becomes the article's introduction picture.
  attributes[:picture] = images.first[:vortex_url] if images and images.first

  begin
    url = @vortex.publish(Vortex::StructuredArticle.new(attributes))
  rescue Exception => e
    puts e.message
    require 'pp'
    pp e.backtrace.inspect
    pp attributes
    puts "Path: " + dest_path
    exit
  end

  # Build html for the remaining images, to be appended at the bottom.
  images_html = ""
  if images and images.size > 1
    images[1..images.size].each do |image|
      images_html += <<EOF
<p>
<div class="vrtx-introduction-image" style="width: 300px; ">
<a title="Last ned bilde i full størrelse" href="#{image[:vortex_url_org]}">
<img src="#{image[:vortex_url]}" style="width: 300px;" />
</a>
<div class="vrtx-imagetext">
<div class="vrtx-imagedescription">
#{image[:caption]}
</div>
</div>
</div>
</p>
EOF
    end
  end

  # Reopen document and set caption on article image
  if images and images.first
    @vortex.find(url) do |item|
      data = JSON.parse(item.content)
      caption = images.first[:caption] ? images.first[:caption] : ""
      caption += " <a href=\"#{images.first[:vortex_url_org]}\">Last ned i full størrelse</a>"
      data["properties"]["caption"] = caption
      # Add additional images at bottom
      data["properties"]["content"] = data["properties"]["content"] + images_html
      item.content = data.to_json
    end
  end

  return url
end
253
+
254
# Scrape a single article page and republish it through publish_article,
# recording the old→new url mapping in scrape_holocaust.log.
# Pages without an ".article .title" element are skipped with a warning.
def scrape_article(url)
  # puts "Scraping article: " + url
  doc = Nokogiri::HTML.parse(open(url))
  doc.encoding = 'utf-8'
  titles = doc.css(".article .title")
  if titles.size() == 0
    puts "Warning. No title. Ignoring: " + url
    return
  end
  title = titles.first.inner_html

  introduction = ""
  begin
    introduction = doc.css(".article .abstract").first.inner_html
  rescue
    # no abstract on this page — keep the empty string
  end

  body = ""
  doc.css(".article .text").each do |paragraph|
    body += "<p>" + paragraph.inner_html + "</p>"
  end

  # "Fjern bindestrek" (remove dash). NOTE(review): source and replacement
  # look identical here — possibly an encoding fix lost in transit; kept
  # byte-for-byte to preserve behavior.
  title = title.gsub('–','–')

  path = URI.parse(url).path

  # Remove inline css
  introduction = introduction.gsub(/style=\"[^\"]*\"/, "")
  body = body.gsub(/style=\"[^\"]*\"/, "")

  published_path = publish_article(path, title, introduction, body, doc)
  # log = path + ";" + URI.parse(published_path).path
  log = url + ";" + published_path + "\n"
  File.open("scrape_holocaust.log", 'a') { |f| f.write(log) }
  puts title + " => " + published_path
end
289
+
290
# Return the integer http status code of a HEAD request for +url+.
def http_status_code(url)
  uri = URI.parse(url)
  response = Net::HTTP.start(uri.host, uri.port) { |http| http.head(uri.request_uri) }
  response.code.to_i
end
296
+
297
# Recursively scrape a listing page: pages with a ".folder .list" section
# have each linked article followed; anything else is treated as an
# article and handed to scrape_article. 404s and the oversized press
# pages are skipped with a warning.
def scrape_article_listing(url)
  if http_status_code(url) == 404
    puts "Advarsel: Status code 404: " + url
    return
  end

  # Ad-hoc rule to ignore path
  if url =~ /\/kunnskapsbasen\/Presse/
    # BUG FIX: the original printed the literal word "url" because the
    # string lacked #{} interpolation.
    puts "Advarsel: Ignorerer pressesidene har for store bilder for vortex: #{url}"
    return
  end

  doc = Nokogiri::HTML.parse(open(url))
  doc.encoding = 'utf-8'

  if doc.css(".folder .list").size > 0
    puts "Scraping article listing page: " + url
    doc.css(".folder .list .article").each do |article|
      href = article.css(".title a").attr("href").text
      begin
        href = href.gsub("%20", "")
      rescue
        # href may be nil-ish for malformed entries; keep it as-is
      end
      scrape_article_listing(href)
    end
  else
    # Pages without .folder .list is articles
    puts "Scraping article: " + url
    scrape_article(url)
  end
end
330
+
331
# if @vortex.exists?('/konv/kunnskapsbasen/aktor') then
#   @vortex.delete('/konv/kunnskapsbasen/aktor')
# end

@url = "http://www.hlsenteret.no/kunnskapsbasen/"
File.open("scrape_holocaust.log", 'w') { |f| f.write("") } # truncate the log
scrape_article_listing(@url)

# This one has 4 images:
# scrape_article("http://www.hlsenteret.no/kunnskapsbasen/folkemord/armenerne/1334")

# This one has no title and should be ignored:
# scrape_article('http://www.hlsenteret.no/kunnskapsbasen/tradisjoner/86')

# This one returns 404:
# scrape_article_listing("http://www.hlsenteret.no/kunnskapsbasen/tema/kunst")

# This one has images that are too large for vortex:
# scrape_article_listing("http://www.hlsenteret.no/kunnskapsbasen/Presse/")

# scrape_article('http://www.hlsenteret.no/kunnskapsbasen/tema/relpol/10129')
@@ -0,0 +1,134 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'rubygems'
3
+ require 'vortex_client'
4
+ require 'nokogiri'
5
+ require 'open-uri'
6
+ require 'uri'
7
+ require 'pry'
8
+ require 'net/http'
9
+ require 'pathname'
10
+ require 'json'
11
+ require 'pp'
12
+
13
# Pathname#relative_path_from calls cleanpath on its argument; the code
# below passes plain Strings, so give String a no-op cleanpath.
class String
  # Return the receiver unchanged — no path normalization is needed here.
  def cleanpath
    self
  end
end
20
+
21
# Relativize +to+ against +from+, then drop a single leading "../" so
# sibling links resolve from the page's own directory.
def relative_url(from, to)
  relative = Pathname.new(to).relative_path_from(from).to_s
  relative.gsub(/^\.\.\//, '')
end
26
+
27
# Read the link database written by the scraper: one "old_url;new_url"
# pair per line. Returns a hash mapping old urls to new urls.
#
# filename - log file to read (defaults to the scraper's log, keeping the
#            original call sites working unchanged).
def read_linkbase_from_file(filename = 'scrape_holocaust.log')
  old_pages = { }
  # File.foreach closes the handle; the original File.open(...).each leaked it.
  File.foreach(filename) do |line|
    # BUG FIX: the original used String#chop, which unconditionally drops
    # the last character — corrupting the final line when the file has no
    # trailing newline. chomp removes only a line terminator.
    pages = line.chomp.split(/;/)
    if pages[0]
      old_pages[pages[0]] = pages[1]
    end
  end
  return old_pages
end
39
+
40
# Generate html content for the right column "related content" box in
# vortex, built from the ".related" section of the old page: each related
# title becomes a bold heading followed by a list of links, rewritten to
# relative urls when the target is in the migrated link database.
def generate_related_content(old_url, new_url)
  html = ""
  doc = Nokogiri::HTML.parse(open(old_url))
  doc.encoding = 'utf-8'
  doc.css(".related .title").each do |related_title|
    html += "<p><b>#{related_title.text}</b></p>\n"
    element = related_title.next_element
    html += "<ul>\n"
    while element
      href = element.css("a").attr("href").to_s
      text = element.css("a").text
      # Migrated page → relative link; anything else keeps the old href.
      link_to = @old_pages[href] ? relative_url(new_url, @old_pages[href]) : href
      element = element.next_element
      html += " <li><a href=\"#{link_to}\">#{text}</a>\n"
    end
    html += "</ul>\n\n"
  end
  return html
end
70
+
71
# Rewrite links in +html+: every anchor href found in the @old_pages link
# database is replaced with a url relative to +new_url+. Returns the
# updated html string.
def update_links_in_tekst(html, new_url)
  doc = Nokogiri::HTML.parse(html)
  doc.css("a").each do |anchor|
    next unless anchor.attributes["href"]
    href = anchor.attr("href")
    next unless @old_pages[href]
    link_to = relative_url(new_url, @old_pages[href])
    puts " Replace link in body:" + link_to
    html = html.gsub(href, link_to)
  end
  return html
end
85
+
86
+
87
+ # # Debugg code
88
+ # @vortex = Vortex::Connection.new("https://nyweb4-dav.uio.no", :use_osx_keychain => true)
89
+ # @old_pages = read_linkbase_from_file
90
+ # old_url = "http://www.hlsenteret.no/kunnskapsbasen/tradisjoner/buddhisme/1049"
91
+ # new_url = "https://nyweb4-dav.uio.no/konv/kunnskapsbasen/tradisjoner/buddhisme/hellige-skrifter-i-buddhismen.html"
92
+
93
+ # # old_url = "http://www.hlsenteret.no/kunnskapsbasen/tema/religionsfrihet"
94
+ # # new_url = "https://nyweb4-dav.uio.no/konv/kunnskapsbasen/tema/religionsfrihet/religions-og-livssynsfrihet.html"
95
+
96
+ # src = @vortex.get(URI.parse(new_url).path)
97
+ # data = JSON.parse(src)
98
+ # data['properties']['hideAdditionalContent'] = "false"
99
+ # # data['properties']['related-content'] = related_content_html
100
+ # content = data['properties']['content']
101
+ # content = update_links_in_tekst(content,new_url)
102
+ # puts content
103
+ # exit
104
+
105
+ # update_links_in_tekst("http://www.hlsenteret.no/kunnskapsbasen/Holocaust_og_andre_folkemord",nil)
106
+ # exit
107
+
108
+
109
# Main: walk every (old_url, new_url) pair from the link database and
# update the published page with related-content html and rewritten links.
@vortex = Vortex::Connection.new("https://nyweb4-dav.uio.no", :use_osx_keychain => true)

@old_pages = read_linkbase_from_file
count = 1
@old_pages.each do |old_url, new_url|
  puts count
  puts old_url
  puts new_url
  dest = URI.parse(new_url).path
  puts "Url: '" + dest.to_s + "'"
  related_content_html = generate_related_content(old_url, new_url)

  data = JSON.parse(@vortex.get(dest))
  data['properties']['hideAdditionalContent'] = "false"
  data['properties']['related-content'] = related_content_html

  data['properties']['content'] = update_links_in_tekst(data['properties']['content'], new_url)
  @vortex.put_string(dest, data.to_json)
  puts "-------"

  count += 1
  # exit if count > 10
end