vortex_client 0.6.0 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,38 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'rubygems'
3
+ require 'open-uri'
4
+ require 'vortex_client'
5
+ require 'uri'
6
+ require 'nokogiri'
7
+ require 'htmlentities'
8
+ require 'json'
9
+ require 'iconv'
10
+
11
# -*- coding: utf-8 -*-
class MigrateHeroPublications
  attr :vortex, :uri

  # Opens a WebDAV connection to the destination CMS.
  #
  # url - destination server url; credentials are taken from the OS X keychain.
  def initialize(url)
    @vortex = Vortex::Connection.new(url, :use_osx_keychain => true)
    @uri = URI.parse(url)
  end

  # Scrapes the publication listing at +url+ and prints every table cell
  # containing a digits:digits token (e.g. a volume:number reference),
  # separated by "-------" markers.
  #
  # NOTE(review): the original pattern /\d*:\d*/ matched any cell with a
  # bare ":" because both \d* may be empty; tightened to /\d+:\d+/ so only
  # cells with digits on both sides of the colon are reported.
  def migrate_publications(url)
    doc = Nokogiri::HTML.parse(open(url))
    doc.encoding = 'utf-8'
    doc.xpath("//td").each do |cell|
      if cell.inner_text =~ /\d+:\d+/
        puts cell.inner_text
        puts "-------"
      end
    end
  end
end
31
+
32
# Scrape every webpage found under src_url and republish it under dest_url.
dest_url = 'https://nyweb1-dav.uio.no/konv/hero/publikasjoner'
# Source listings tried earlier:
# src_url = 'http://www.hero.uio.no/publicat/2003/'
# src_url = 'http://www.hero.uio.no/nyheter.html'
src_url = 'http://www.hero.uio.no/publications_all/publications10.html'
MigrateHeroPublications.new(dest_url).migrate_publications(src_url)
@@ -0,0 +1,351 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'rubygems'
3
+ require 'vortex_client'
4
+ require 'nokogiri'
5
+ require 'open-uri'
6
+ require 'uri'
7
+ require 'pry'
8
+ require "net/http"
9
+ require 'pathname'
10
+ require 'json'
11
+ require 'pp'
12
+
13
+ # Scrape content from the Norwegian Center for Studies of Holocaust and Religious Minorities'
14
+ # website http://www.hlsenteret.no/ and re-publish content to University of Oslo's CMS
15
+ # using the WebDAV API.
16
+
17
+ # TODO
18
+ # ok - Hente ut tittel på mappen
19
+ # ok - Hente ut content/type på bilder
20
+ # ok - Kopiere over ingress bilde
21
+ # ok - Luke ut rare tegn i overskriftene
22
+ # ok - Kjøre alle dokumenter uten kræsj
23
+ # ok - Kopiere over flere bilder
24
+ # ok - Få logging til fil til å fungere
25
+ # - Logg publisering til fil
26
+ # - Hvorfor konverteres og publiseres /konv/kunnskapsbasen/-a-hrefhttp-.html
27
+ # - Håndtere /konv/kunnskapsbasen/hl-senterets-kunnskapsbase.html spesielt?
28
+
29
+ @vortex = Vortex::Connection.new("https://nyweb4-dav.uio.no", :use_osx_keychain => true)
30
+
31
# Simple logger: echoes +str+ to stdout and appends a timestamped
# "ISO8601;str" line to scrape_holocaust.log in the current directory.
def log(str)
  require 'time' # Time#iso8601 lives in the 'time' stdlib, which this file never required
  puts str
  File.open("scrape_holocaust.log", 'a') do |f|
    f.write(Time.now.iso8601 + ";" + str + "\n")
  end
end
38
+
39
# Issues a GET request to +url+ and returns the value of the
# Content-Type response header.
def http_content_type(url)
  uri = URI.parse(url)
  request = Net::HTTP::Get.new(uri.request_uri)
  request["User-Agent"] = "My Ruby Script"
  request["Accept"] = "*/*"
  Net::HTTP.new(uri.host, uri.port).request(request)['content-type']
end
50
+
51
# Scrapes the page at +url+ and returns the folder title, falling back to
# the article title; returns nil when the page cannot be fetched or no
# title element is present.
#
# Fixes a NameError in the original, which opened the undefined variable
# `scrape_url` instead of the `url` parameter — the rescue swallowed the
# error, so the method always returned nil.
def scrape_folder_title(url)
  title = nil
  begin
    doc = Nokogiri::HTML.parse(open(url))
    doc.encoding = 'utf-8'
    begin
      title = doc.css(".folder .title").first.inner_html
    rescue
      # No .folder .title on the page; fall back to the article title below.
    end
    if title == nil
      title = doc.css(".article .title").first.inner_html
    end
  rescue
    # Page unreachable or no title at all: report nil.
  end
  return title
end
68
+
69
# Walks dest_path segment by segment (lower-cased) and creates every missing
# collection in Vortex.  Newly created collections are configured as article
# listings and, when a title can be scraped from the source site, are given
# that title (first letter capitalized).
def create_path(dest_path)
  current = "/"
  dest_path.split("/").reject { |segment| segment == "" }.each do |segment|
    current = current + segment.downcase + "/"
    next if @vortex.exists?(current)

    puts "Creating folder " + current

    host = URI.parse(@url).host
    title = scrape_folder_title('http://' + host + current.gsub('/konv',''))
    if title
      # Capitalize only the first letter; leave the rest untouched.
      title = title[0..0].upcase + title[1..title.length]
      puts "Mappetittel: " + title.to_s
    end

    @vortex.mkdir(current)
    @vortex.proppatch(current,'<v:collection-type xmlns:v="vrtx">article-listing</v:collection-type>')
    if title
      @vortex.proppatch(current,'<v:userTitle xmlns:v="vrtx">' + title.to_s + '</v:userTitle>')
    end
  end
end
98
+
99
+
100
# Returns an array of { :url, :caption } hashes, one per image in the
# page's image series; captions are matched to images by position.
def scrape_images(doc)
  captions = doc.css(".imageText")
  doc.css(".imageSeriesImage").each_with_index.map do |item, index|
    { :url => item.attr("src"), :caption => captions[index].text }
  end
end
112
+
113
+
114
# Resizes an image to +size+ pixels wide using the OS X command line
# utility 'sips' and returns the resized image data as a binary string.
#
# Fixes from review:
# - the sips invocation had lost its input-file argument; restored as
#   #{filename}
# - the image is now written and read back in binary mode; the original
#   IO.readlines(file, 'r').to_s split the data on every "r" byte and, on
#   Ruby >= 1.9, returned an Array#inspect string, corrupting the image
def resize_image(content, content_type, size)
  filename = "/tmp/" + (1 + rand(10000000)).to_s + "." + content_type
  filename_resized = "/tmp/" + (1 + rand(10000000)).to_s + "_resized." + content_type
  File.open(filename, 'wb') do |f|
    f.write(content)
  end
  %x[sips --resampleWidth #{size} #{filename} --out #{filename_resized}]
  return File.binread(filename_resized)
end
125
+
126
+
127
# Downloads the image at src_url, stores the original and a 300px-wide
# resized copy under dest_path in Vortex, and returns a hash with both
# destination urls (:vortex_url, :vortex_url_resized).
def download_image(src_url, dest_path)
  src_url = src_url.gsub(/\?.*/,'')  # drop any query string
  content_type = http_content_type(src_url)
  content_type = content_type.gsub("image/", "").gsub("jpeg","jpg")
  content = open(src_url).read
  basename = Pathname.new(src_url).basename.to_s.gsub(/\..*/,'')
  vortex_url = (dest_path + basename + "." + content_type).downcase
  begin
    @vortex.put_string(vortex_url, content)
  rescue StandardError => e
    # NOTE(review): narrowed from `rescue Exception`, which also swallowed
    # SignalException/SystemExit.
    puts e.message
    pp e.backtrace.inspect
    puts "vortex_url: " + vortex_url
    exit
  end

  # Store a resized copy alongside the original.
  puts "Nedskalerer bilde: " + src_url
  content_resized = resize_image(content, content_type, 300)
  vortex_url_resized = (dest_path + basename + "_width_300." + content_type).downcase
  begin
    @vortex.put_string(vortex_url_resized, content_resized)
  rescue StandardError => e
    puts e.message
    pp e.backtrace.inspect
    puts "vortex_url_resized: " + vortex_url_resized
    exit
  end

  return { :vortex_url_resized => vortex_url_resized, :vortex_url => vortex_url }
end
162
+
163
+
164
# Publishes an article scraped from the source site to Vortex.
#
# path         - source path; trailing digits are stripped and the result is
#                prefixed with /konv to form the destination folder
# title        - article title (also used to derive the destination filename)
# introduction - html introduction
# body         - html body
# doc          - parsed Nokogiri document (used to scrape the image series)
#
# Returns the url of the published article.
def publish_article(path, title, introduction, body, doc)
  dest_path = "/konv" + path.gsub(/\d*$/,'')
  dest_path = dest_path + "/" if not(dest_path =~ /\/$/)

  puts "Publishing to: '#{dest_path}'"

  create_path(dest_path)
  @vortex.cd(dest_path)

  # images = []
  # Comment out the next line to prevent the scraper from downloading images.
  images = scrape_images(doc)

  images.each do |image|
    filenames = download_image(image[:url], dest_path)
    image[:vortex_url] = filenames[:vortex_url_resized]
    image[:vortex_url_org] = filenames[:vortex_url]
  end

  url = dest_path + Vortex::StringUtils.create_filename(title.gsub(":","_")).gsub(/-$/,'') + '.html'
  url = url.downcase

  attributes = {:title => title,
                :introduction => introduction,
                :body => body,
                :publishedDate => Time.now,
                :url => url}
  attributes[:picture] = images.first[:vortex_url] if images and images.first

  article = Vortex::StructuredArticle.new(attributes)
  begin
    url = @vortex.publish(article)
  rescue StandardError => e
    # NOTE(review): narrowed from `rescue Exception`, which also swallowed
    # SignalException/SystemExit.
    puts e.message
    pp e.backtrace.inspect
    pp attributes
    puts "Path: " + dest_path
    exit
  end

  # Build html for any additional images (all but the first).
  images_html = ""
  if images and images.size > 1
    images[1..images.size].each do |image|
      image_html = <<EOF
<p>
<div class="vrtx-introduction-image" style="width: 300px; ">
<a title="Last ned bilde i full størrelse" href="#{image[:vortex_url_org]}">
<img src="#{image[:vortex_url]}" style="width: 300px;" />
</a>
<div class="vrtx-imagetext">
<div class="vrtx-imagedescription">
#{image[:caption]}
</div>
</div>
</div>
</p>
EOF
      images_html = images_html + image_html
    end
  end

  # Reopen the published document to set the caption on the article image
  # and append the additional images at the bottom.
  if images and images.first
    @vortex.find(url) do |item|
      data = JSON.parse(item.content)

      caption = ""
      caption = images.first[:caption] if images.first[:caption]
      caption = caption + " <a href=\"#{images.first[:vortex_url_org]}\">Last ned i full størrelse</a>"
      data["properties"]["caption"] = caption

      data["properties"]["content"] = data["properties"]["content"] + images_html
      item.content = data.to_json
    end
  end

  return url
end
253
+
254
# Scrapes a single article page and republishes it via publish_article,
# logging "source_url;published_url" to scrape_holocaust.log.
# Pages without a title are skipped with a warning.
def scrape_article(url)
  doc = Nokogiri::HTML.parse(open(url))
  doc.encoding = 'utf-8'

  titles = doc.css(".article .title")
  if titles.size() == 0
    puts "Warning. No title. Ignoring: " + url
    return
  end
  title = titles.first.inner_html

  introduction = ""
  begin
    introduction = doc.css(".article .abstract").first.inner_html
  rescue
    # No abstract on the page; keep the empty introduction.
  end

  body = ""
  doc.css(".article .text").each do |paragraph|
    body = body + "<p>" + paragraph.inner_html + "</p>"
  end

  # Remove dash ("Fjern bindestrek").
  # NOTE(review): this gsub replaces an en dash with itself and is a no-op;
  # kept as-is to preserve behavior.
  title = title.gsub('–','–')

  path = URI.parse(url).path

  # Strip inline css from the scraped markup.
  introduction = introduction.gsub(/style=\"[^\"]*\"/,"")
  body = body.gsub(/style=\"[^\"]*\"/,"")

  published_path = publish_article(path, title, introduction, body, doc)
  # log = path + ";" + URI.parse(published_path).path
  log_line = url + ";" + published_path + "\n"
  File.open("scrape_holocaust.log", 'a') {|f| f.write(log_line) }
  puts title + " => " + published_path
end
289
+
290
# Returns the numeric HTTP status code of a HEAD request to +url+.
def http_status_code(url)
  uri = URI.parse(url)
  response = Net::HTTP.start(uri.host, uri.port) { |session| session.head(uri.request_uri) }
  response.code.to_i
end
296
+
297
# Recursively scrapes +url+: listing pages (those containing a .folder .list)
# are descended into, every other page is treated as an article.
# Pages returning 404 and the press pages (whose images are too large for
# Vortex) are skipped with a warning.
def scrape_article_listing(url)
  if http_status_code(url) == 404
    puts "Advarsel: Status code 404: " + url
    return
  end

  # Ad-hoc rule to ignore this path.
  if url =~ /\/kunnskapsbasen\/Presse/
    # NOTE(review): the original message printed the literal text "url";
    # it now interpolates the actual address.
    puts "Advarsel: Ignorerer pressesidene har for store bilder for vortex: #{url}"
    return
  end

  doc = Nokogiri::HTML.parse(open(url))
  doc.encoding = 'utf-8'

  if doc.css(".folder .list").size > 0
    puts "Scraping article listing page: " + url
    doc.css(".folder .list .article").each do |article|
      href = article.css(".title a").attr("href").text
      begin
        href = href.gsub("%20","")
      rescue
        # href not a gsub-able string; use it unchanged.
      end
      scrape_article_listing(href)
    end
  else
    # Pages without .folder .list are articles.
    puts "Scraping article: " + url
    scrape_article(url)
  end
end
330
+
331
# if @vortex.exists?('/konv/kunnskapsbasen/aktor') then
#   @vortex.delete('/konv/kunnskapsbasen/aktor')
# end

# Truncate the log, then crawl the whole knowledge base.
@url = "http://www.hlsenteret.no/kunnskapsbasen/"
File.open("scrape_holocaust.log", 'w') {|f| f.write("") }
scrape_article_listing(@url)

# Sample pages used while debugging:
# This one has 4 images:
# scrape_article("http://www.hlsenteret.no/kunnskapsbasen/folkemord/armenerne/1334")
# This one has no title and must be ignored:
# scrape_article('http://www.hlsenteret.no/kunnskapsbasen/tradisjoner/86')
# This one returns 404:
# scrape_article_listing("http://www.hlsenteret.no/kunnskapsbasen/tema/kunst")
# This one has images that are too large for Vortex:
# scrape_article_listing("http://www.hlsenteret.no/kunnskapsbasen/Presse/")
# scrape_article('http://www.hlsenteret.no/kunnskapsbasen/tema/relpol/10129')
@@ -0,0 +1,134 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'rubygems'
3
+ require 'vortex_client'
4
+ require 'nokogiri'
5
+ require 'open-uri'
6
+ require 'uri'
7
+ require 'pry'
8
+ require 'net/http'
9
+ require 'pathname'
10
+ require 'json'
11
+ require 'pp'
12
+
13
# Some Pathname#relative_path_from implementations call #cleanpath on their
# argument, and String does not define it in this ruby version.  The value
# is used as-is here, so an identity implementation is sufficient.
class String
  def cleanpath
    self
  end
end
20
+
21
# Returns +to+ expressed relative to +from+, with a single leading "../"
# stripped off (because the last component of +from+ is a file, not a folder).
def relative_url(from, to)
  relative = Pathname.new(to).relative_path_from(from).to_s
  relative.gsub(/^\.\.\//,'')
end
26
+
27
# Reads the link database written by the scraper (one "old_url;new_url" pair
# per line in scrape_holocaust.log) and returns it as a hash mapping old
# urls to new urls.
def read_linkbase_from_file
  mapping = {}
  File.foreach('scrape_holocaust.log') do |line|
    old_url, new_url = line.chop.split(/;/)
    mapping[old_url] = new_url if old_url
  end
  mapping
end
39
+
40
# Builds the html for the "related content" box in the right column of the
# new page: for every ".related .title" on the old page, a bold heading
# followed by a list of the links in its following sibling elements.
# Links found in the scraped link database are rewritten relative to
# new_url; all other hrefs are kept untouched.
def generate_related_content(old_url, new_url)
  html = ""
  doc = Nokogiri::HTML.parse(open(old_url))
  doc.encoding = 'utf-8'
  doc.css(".related .title").each do |heading|
    html += "<p><b>#{heading.text}</b></p>\n"
    sibling = heading.next_element
    html += "<ul>\n"
    while sibling
      href = sibling.css("a").attr("href").to_s
      text = sibling.css("a").text
      link_to = @old_pages[href] ? relative_url(new_url, @old_pages[href]) : href
      sibling = sibling.next_element
      html += " <li><a href=\"#{link_to}\">#{text}</a>\n"
    end
    html += "</ul>\n\n"
  end
  return html
end
70
+
71
# Rewrites every <a href> in +html+ that points to a page in the scraped
# link database (@old_pages) so that it points, relatively, to the page's
# new location.  Returns the updated html string.
def update_links_in_tekst(html, new_url)
  Nokogiri::HTML.parse(html).css("a").each do |anchor|
    next unless anchor.attributes["href"]
    href = anchor.attr("href")
    next unless @old_pages[href]
    link_to = relative_url(new_url, @old_pages[href])
    puts " Replace link in body:" + link_to
    html = html.gsub(href, link_to)
  end
  return html
end
85
+
86
+
87
+ # # Debugg code
88
+ # @vortex = Vortex::Connection.new("https://nyweb4-dav.uio.no", :use_osx_keychain => true)
89
+ # @old_pages = read_linkbase_from_file
90
+ # old_url = "http://www.hlsenteret.no/kunnskapsbasen/tradisjoner/buddhisme/1049"
91
+ # new_url = "https://nyweb4-dav.uio.no/konv/kunnskapsbasen/tradisjoner/buddhisme/hellige-skrifter-i-buddhismen.html"
92
+
93
+ # # old_url = "http://www.hlsenteret.no/kunnskapsbasen/tema/religionsfrihet"
94
+ # # new_url = "https://nyweb4-dav.uio.no/konv/kunnskapsbasen/tema/religionsfrihet/religions-og-livssynsfrihet.html"
95
+
96
+ # src = @vortex.get(URI.parse(new_url).path)
97
+ # data = JSON.parse(src)
98
+ # data['properties']['hideAdditionalContent'] = "false"
99
+ # # data['properties']['related-content'] = related_content_html
100
+ # content = data['properties']['content']
101
+ # content = update_links_in_tekst(content,new_url)
102
+ # puts content
103
+ # exit
104
+
105
+ # update_links_in_tekst("http://www.hlsenteret.no/kunnskapsbasen/Holocaust_og_andre_folkemord",nil)
106
+ # exit
107
+
108
+
109
# Main loop: for every page recorded in the link database, generate the
# related-content box, rewrite in-body links, and write the updated
# document back to Vortex.
@vortex = Vortex::Connection.new("https://nyweb4-dav.uio.no", :use_osx_keychain => true)

@old_pages = read_linkbase_from_file
count = 1
@old_pages.each do |old_url, new_url|
  puts count
  puts old_url
  puts new_url
  puts "Url: '" + URI.parse(new_url).path.to_s + "'"
  related_content_html = generate_related_content(old_url, new_url)

  dest = URI.parse(new_url).path
  data = JSON.parse(@vortex.get(dest))
  data['properties']['hideAdditionalContent'] = "false"
  data['properties']['related-content'] = related_content_html
  data['properties']['content'] = update_links_in_tekst(data['properties']['content'], new_url)

  @vortex.put_string(dest, data.to_json)
  puts "-------"

  count += 1
  # exit if count > 10
end