vortex_client 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,38 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'rubygems'
3
+ require 'open-uri'
4
+ require 'vortex_client'
5
+ require 'uri'
6
+ require 'nokogiri'
7
+ require 'htmlentities'
8
+ require 'json'
9
+ require 'iconv'
10
+
11
# Scrapes HERO publication listing pages and republishes them through the
# Vortex WebDAV API.
class MigrateHeroPublications
  attr_reader :vortex, :uri

  # url - base WebDAV url of the destination server. Opens the WebDAV
  # connection (credentials from the OS X keychain) and keeps the parsed uri.
  def initialize(url)
    @vortex = Vortex::Connection.new(url, :use_osx_keychain => true)
    @uri = URI.parse(url)
  end

  # Fetch the page at +url+ and print every table cell whose text matches
  # the (very permissive) "<digits>:<digits>" pattern, separated by rulers.
  def migrate_publications(url)
    doc = Nokogiri::HTML.parse(open(url))
    doc.encoding = 'utf-8'
    doc.xpath("//td").each do |cell|
      text = cell.inner_text
      next unless text =~ /\d*:\d*/
      puts text
      puts "-------"
    end
  end
end
31
+
32
# Scrape all webpages found in src_url and store in dest_url
dest_url = 'https://nyweb1-dav.uio.no/konv/hero/publikasjoner'
# src_url = 'http://www.hero.uio.no/publicat/2003/'
# src_url = 'http://www.hero.uio.no/nyheter.html'
src_url = 'http://www.hero.uio.no/publications_all/publications10.html'

migration = MigrateHeroPublications.new(dest_url)
migration.migrate_publications(src_url)
@@ -0,0 +1,351 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'rubygems'
3
+ require 'vortex_client'
4
+ require 'nokogiri'
5
+ require 'open-uri'
6
+ require 'uri'
7
+ require 'pry'
8
+ require "net/http"
9
+ require 'pathname'
10
+ require 'json'
11
+ require 'pp'
12
+
13
+ # Scrape content from the Norwegian Center for Studies of Holocaust and Religious Minorities'
14
+ # website http://www.hlsenteret.no/ and re-publish content to University of Oslo's CMS
15
+ # using the WebDAV API.
16
+
17
+ # TODO
18
+ # ok - Hente ut tittel på mappen
19
+ # ok - Hente ut content/type på bilder
20
+ # ok - Kopiere over ingress bilde
21
+ # ok - Luke ut rare tegn i overskriftene
22
+ # ok - Kjøre alle dokumenter uten kræsj
23
+ # ok - Kopiere over flere bilder
24
+ # ok - Få logging til fil til å fungere
25
+ # - Logg publisering til fil
26
+ # - Hvorfor konverteres og publiseres /konv/kunnskapsbasen/-a-hrefhttp-.html
27
+ # - Håndtere /konv/kunnskapsbasen/hl-senterets-kunnskapsbase.html spesielt?
28
+
29
# Shared WebDAV connection used by all helpers below (OS X keychain auth).
@vortex = Vortex::Connection.new("https://nyweb4-dav.uio.no", :use_osx_keychain => true)
30
+
31
# Simple logger: echo +str+ to stdout and append a timestamped
# "ISO8601;message" line to scrape_holocaust.log.
def log(str)
  # BUG FIX: Time#iso8601 lives in the stdlib 'time' extension, which this
  # file never required — every call raised NoMethodError. require is
  # idempotent, so loading it here is safe.
  require 'time'
  puts str
  File.open("scrape_holocaust.log", 'a') do |f|
    f.write(Time.now.iso8601 + ";" + str + "\n")
  end
end
38
+
39
# Issue a GET request for +url+ and return the value of the
# Content-Type response header.
def http_content_type(url)
  uri = URI.parse(url)
  request = Net::HTTP::Get.new(uri.request_uri)
  request["User-Agent"] = "My Ruby Script"
  request["Accept"] = "*/*"
  response = Net::HTTP.new(uri.host, uri.port).request(request)
  response['content-type']
end
50
+
51
# Scrape the title of a source page: prefer the ".folder .title" element,
# fall back to ".article .title". Returns nil when the page cannot be
# fetched or has no recognizable title.
def scrape_folder_title(url)
  title = nil
  begin
    # BUG FIX: the original opened the undefined local `scrape_url`; the
    # resulting NameError was swallowed by the outer rescue, so the method
    # unconditionally returned nil and no folder ever got a title.
    doc = Nokogiri::HTML.parse(open(url))
    doc.encoding = 'utf-8'
    begin
      title = doc.css(".folder .title").first.inner_html
    rescue
      # .first is nil when the selector matches nothing — fall through.
    end
    if title == nil
      title = doc.css(".article .title").first.inner_html
    end
  rescue
    # Network errors or a missing ".article .title" both end here: nil.
  end
  return title
end
68
+
69
# Ensure every folder in +dest_path+ exists in Vortex, creating missing
# collections as article listings. For each newly created folder the
# matching page on the source site (@url host, "/konv" prefix stripped)
# is scraped for a title, which is capitalized and set on the collection.
def create_path(dest_path)
  path_so_far = "/"
  dest_path.split("/").each do |folder|
    next if folder == ""
    path_so_far += folder.downcase + "/"
    next if @vortex.exists?(path_so_far)

    puts "Creating folder " + path_so_far
    uri = URI.parse(@url)
    title = scrape_folder_title('http://' + uri.host + path_so_far.gsub('/konv', ''))
    if title
      # Capitalize only the first letter, preserving the rest verbatim.
      title = title[0..0].upcase + title[1..title.length]
      puts "Mappetittel: " + title.to_s
    end

    @vortex.mkdir(path_so_far)
    @vortex.proppatch(path_so_far, '<v:collection-type xmlns:v="vrtx">article-listing</v:collection-type>')
    if title
      @vortex.proppatch(path_so_far, '<v:userTitle xmlns:v="vrtx">' + title.to_s + '</v:userTitle>')
    end
  end
end
98
+
99
+
100
# Return an array of {:url, :caption} hashes, pairing each image in the
# page's image series with the caption text at the same position.
# NOTE(review): assumes .imageText elements line up 1:1 with
# .imageSeriesImage elements, as the original did.
def scrape_images(doc)
  doc.css(".imageSeriesImage").each_with_index.map do |item, index|
    { :url => item.attr("src"), :caption => doc.css(".imageText")[index].text }
  end
end
112
+
113
+
114
# Resize an image using the unix command line utility 'sips' available on osx.
#
# content      - raw image bytes
# content_type - file extension ("jpg", "png", ...)
# size         - target width in pixels
#
# Returns the raw bytes of the resized image.
def resize_image(content, content_type, size)
  filename = "/tmp/" + (1 + rand(10000000)).to_s + "." + content_type
  filename_resized = "/tmp/" + (1 + rand(10000000)).to_s + "_resized." + content_type
  # BUG FIX: write in binary mode so image bytes survive the round trip.
  File.open(filename, 'wb') { |f| f.write(content) }
  # BUG FIX: the input filename was missing from the sips invocation
  # (the original shipped with a garbled placeholder in its place).
  %x[sips --resampleWidth #{size} #{filename} --out #{filename_resized}]
  # BUG FIX: IO.readlines(path, 'r') treated "r" as the line separator and
  # returned an Array (whose to_s is an inspect string on modern Ruby);
  # read the result back as one binary string instead.
  File.open(filename_resized, 'rb') { |f| f.read }
end
125
+
126
+
127
# Download the graphic at +src_url+, store both the original and a
# 300px-wide copy under +dest_path+ in Vortex, and return their urls as
# { :vortex_url => ..., :vortex_url_resized => ... }. Exits the process
# with diagnostics if either upload fails.
def download_image(src_url, dest_path)
  src_url = src_url.gsub(/\?.*/, '') # strip any query string
  content_type = http_content_type(src_url).gsub("image/", "").gsub("jpeg", "jpg")
  content = open(src_url).read
  basename = Pathname.new(src_url).basename.to_s.gsub(/\..*/, '')

  vortex_url = (dest_path + basename + "." + content_type).downcase
  begin
    @vortex.put_string(vortex_url, content)
  rescue Exception => e
    puts e.message
    pp e.backtrace.inspect
    puts "vortex_url: " + vortex_url
    exit
  end

  # Store a resized image to vortex
  puts "Nedskalerer bilde: " + src_url
  content_resized = resize_image(content, content_type, 300)
  vortex_url_resized = (dest_path + basename + "_width_300." + content_type).downcase
  begin
    @vortex.put_string(vortex_url_resized, content_resized)
  rescue Exception => e
    puts e.message
    pp e.backtrace.inspect
    puts "vortex_url_resized: " + vortex_url_resized
    exit
  end

  { :vortex_url_resized => vortex_url_resized, :vortex_url => vortex_url }
end
162
+
163
+
164
# Publish a scraped article to Vortex.
#
# path         - path of the article on the source site (trailing digits,
#                i.e. the numeric article id, are stripped)
# title        - article title
# introduction - lead-in html
# body         - article body html
# doc          - parsed Nokogiri document, used to harvest the image series
#
# Returns the url the article was published to.
def publish_article(path, title, introduction, body, doc)
  dest_path = "/konv" + path.gsub(/\d*$/, '')
  dest_path += "/" unless dest_path =~ /\/$/

  puts "Publishing to: '#{dest_path}'"
  create_path(dest_path)
  @vortex.cd(dest_path)

  # images = []
  # Comment out this line to prevent scraper from downloading images
  images = scrape_images(doc)
  images.each do |image|
    filenames = download_image(image[:url], dest_path)
    image[:vortex_url] = filenames[:vortex_url_resized]
    image[:vortex_url_org] = filenames[:vortex_url]
  end

  url = (dest_path + Vortex::StringUtils.create_filename(title.gsub(":", "_")).gsub(/-$/, '') + '.html').downcase

  attributes = {
    :title => title,
    :introduction => introduction,
    :body => body,
    :publishedDate => Time.now,
    :url => url
  }
  # First image in the series becomes the article's introduction picture.
  attributes[:picture] = images.first[:vortex_url] if images and images.first

  begin
    url = @vortex.publish(Vortex::StructuredArticle.new(attributes))
  rescue Exception => e
    puts e.message
    require 'pp'
    pp e.backtrace.inspect
    pp attributes
    puts "Path: " + dest_path
    exit
  end

  # Build html for the remaining images, to be appended at the bottom.
  images_html = ""
  if images and images.size > 1
    images[1..images.size].each do |image|
      images_html += <<EOF
<p>
<div class="vrtx-introduction-image" style="width: 300px; ">
<a title="Last ned bilde i full størrelse" href="#{image[:vortex_url_org]}">
<img src="#{image[:vortex_url]}" style="width: 300px;" />
</a>
<div class="vrtx-imagetext">
<div class="vrtx-imagedescription">
#{image[:caption]}
</div>
</div>
</div>
</p>
EOF
    end
  end

  # Reopen document and set caption on article image
  if images and images.first
    @vortex.find(url) do |item|
      data = JSON.parse(item.content)
      caption = images.first[:caption] ? images.first[:caption] : ""
      caption += " <a href=\"#{images.first[:vortex_url_org]}\">Last ned i full størrelse</a>"
      data["properties"]["caption"] = caption
      # Add additional images at bottom
      data["properties"]["content"] = data["properties"]["content"] + images_html
      item.content = data.to_json
    end
  end

  return url
end
253
+
254
# Scrape a single article page and republish it through publish_article,
# recording the old→new url mapping in scrape_holocaust.log.
# Pages without an ".article .title" element are skipped with a warning.
def scrape_article(url)
  # puts "Scraping article: " + url
  doc = Nokogiri::HTML.parse(open(url))
  doc.encoding = 'utf-8'
  titles = doc.css(".article .title")
  if titles.size() == 0
    puts "Warning. No title. Ignoring: " + url
    return
  end
  title = titles.first.inner_html

  introduction = ""
  begin
    introduction = doc.css(".article .abstract").first.inner_html
  rescue
    # no abstract on this page — keep the empty string
  end

  body = ""
  doc.css(".article .text").each do |paragraph|
    body += "<p>" + paragraph.inner_html + "</p>"
  end

  # "Fjern bindestrek" (remove dash). NOTE(review): source and replacement
  # look identical here — possibly an encoding fix lost in transit; kept
  # byte-for-byte to preserve behavior.
  title = title.gsub('–','–')

  path = URI.parse(url).path

  # Remove inline css
  introduction = introduction.gsub(/style=\"[^\"]*\"/, "")
  body = body.gsub(/style=\"[^\"]*\"/, "")

  published_path = publish_article(path, title, introduction, body, doc)
  # log = path + ";" + URI.parse(published_path).path
  log = url + ";" + published_path + "\n"
  File.open("scrape_holocaust.log", 'a') { |f| f.write(log) }
  puts title + " => " + published_path
end
289
+
290
# Return the integer http status code of a HEAD request for +url+.
def http_status_code(url)
  uri = URI.parse(url)
  response = Net::HTTP.start(uri.host, uri.port) { |http| http.head(uri.request_uri) }
  response.code.to_i
end
296
+
297
# Recursively scrape a listing page: pages with a ".folder .list" section
# have each linked article followed; anything else is treated as an
# article and handed to scrape_article. 404s and the oversized press
# pages are skipped with a warning.
def scrape_article_listing(url)
  if http_status_code(url) == 404
    puts "Advarsel: Status code 404: " + url
    return
  end

  # Ad-hoc rule to ignore path
  if url =~ /\/kunnskapsbasen\/Presse/
    # BUG FIX: the original printed the literal word "url" because the
    # string lacked #{} interpolation.
    puts "Advarsel: Ignorerer pressesidene har for store bilder for vortex: #{url}"
    return
  end

  doc = Nokogiri::HTML.parse(open(url))
  doc.encoding = 'utf-8'

  if doc.css(".folder .list").size > 0
    puts "Scraping article listing page: " + url
    doc.css(".folder .list .article").each do |article|
      href = article.css(".title a").attr("href").text
      begin
        href = href.gsub("%20", "")
      rescue
        # href may be nil-ish for malformed entries; keep it as-is
      end
      scrape_article_listing(href)
    end
  else
    # Pages without .folder .list is articles
    puts "Scraping article: " + url
    scrape_article(url)
  end
end
330
+
331
# if @vortex.exists?('/konv/kunnskapsbasen/aktor') then
#   @vortex.delete('/konv/kunnskapsbasen/aktor')
# end

@url = "http://www.hlsenteret.no/kunnskapsbasen/"
File.open("scrape_holocaust.log", 'w') { |f| f.write("") } # truncate the log
scrape_article_listing(@url)

# This one has 4 images:
# scrape_article("http://www.hlsenteret.no/kunnskapsbasen/folkemord/armenerne/1334")

# This one has no title and should be ignored:
# scrape_article('http://www.hlsenteret.no/kunnskapsbasen/tradisjoner/86')

# This one returns 404:
# scrape_article_listing("http://www.hlsenteret.no/kunnskapsbasen/tema/kunst")

# This one has images that are too large for vortex:
# scrape_article_listing("http://www.hlsenteret.no/kunnskapsbasen/Presse/")

# scrape_article('http://www.hlsenteret.no/kunnskapsbasen/tema/relpol/10129')
@@ -0,0 +1,134 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'rubygems'
3
+ require 'vortex_client'
4
+ require 'nokogiri'
5
+ require 'open-uri'
6
+ require 'uri'
7
+ require 'pry'
8
+ require 'net/http'
9
+ require 'pathname'
10
+ require 'json'
11
+ require 'pp'
12
+
13
# Pathname#relative_path_from calls cleanpath on its argument; the code
# below passes plain Strings, so give String a no-op cleanpath.
class String
  # Return the receiver unchanged — no path normalization is needed here.
  def cleanpath
    self
  end
end
20
+
21
# Relativize +to+ against +from+, then drop a single leading "../" so
# sibling links resolve from the page's own directory.
def relative_url(from, to)
  relative = Pathname.new(to).relative_path_from(from).to_s
  relative.gsub(/^\.\.\//, '')
end
26
+
27
# Read the link database written by the scraper: one "old_url;new_url"
# pair per line. Returns a hash mapping old urls to new urls.
#
# filename - log file to read (defaults to the scraper's log, keeping the
#            original call sites working unchanged).
def read_linkbase_from_file(filename = 'scrape_holocaust.log')
  old_pages = { }
  # File.foreach closes the handle; the original File.open(...).each leaked it.
  File.foreach(filename) do |line|
    # BUG FIX: the original used String#chop, which unconditionally drops
    # the last character — corrupting the final line when the file has no
    # trailing newline. chomp removes only a line terminator.
    pages = line.chomp.split(/;/)
    if pages[0]
      old_pages[pages[0]] = pages[1]
    end
  end
  return old_pages
end
39
+
40
# Generate html content for the right column "related content" box in
# vortex, built from the ".related" section of the old page: each related
# title becomes a bold heading followed by a list of links, rewritten to
# relative urls when the target is in the migrated link database.
def generate_related_content(old_url, new_url)
  html = ""
  doc = Nokogiri::HTML.parse(open(old_url))
  doc.encoding = 'utf-8'
  doc.css(".related .title").each do |related_title|
    html += "<p><b>#{related_title.text}</b></p>\n"
    element = related_title.next_element
    html += "<ul>\n"
    while element
      href = element.css("a").attr("href").to_s
      text = element.css("a").text
      # Migrated page → relative link; anything else keeps the old href.
      link_to = @old_pages[href] ? relative_url(new_url, @old_pages[href]) : href
      element = element.next_element
      html += " <li><a href=\"#{link_to}\">#{text}</a>\n"
    end
    html += "</ul>\n\n"
  end
  return html
end
70
+
71
# Rewrite links in +html+: every anchor href found in the @old_pages link
# database is replaced with a url relative to +new_url+. Returns the
# updated html string.
def update_links_in_tekst(html, new_url)
  doc = Nokogiri::HTML.parse(html)
  doc.css("a").each do |anchor|
    next unless anchor.attributes["href"]
    href = anchor.attr("href")
    next unless @old_pages[href]
    link_to = relative_url(new_url, @old_pages[href])
    puts " Replace link in body:" + link_to
    html = html.gsub(href, link_to)
  end
  return html
end
85
+
86
+
87
+ # # Debugg code
88
+ # @vortex = Vortex::Connection.new("https://nyweb4-dav.uio.no", :use_osx_keychain => true)
89
+ # @old_pages = read_linkbase_from_file
90
+ # old_url = "http://www.hlsenteret.no/kunnskapsbasen/tradisjoner/buddhisme/1049"
91
+ # new_url = "https://nyweb4-dav.uio.no/konv/kunnskapsbasen/tradisjoner/buddhisme/hellige-skrifter-i-buddhismen.html"
92
+
93
+ # # old_url = "http://www.hlsenteret.no/kunnskapsbasen/tema/religionsfrihet"
94
+ # # new_url = "https://nyweb4-dav.uio.no/konv/kunnskapsbasen/tema/religionsfrihet/religions-og-livssynsfrihet.html"
95
+
96
+ # src = @vortex.get(URI.parse(new_url).path)
97
+ # data = JSON.parse(src)
98
+ # data['properties']['hideAdditionalContent'] = "false"
99
+ # # data['properties']['related-content'] = related_content_html
100
+ # content = data['properties']['content']
101
+ # content = update_links_in_tekst(content,new_url)
102
+ # puts content
103
+ # exit
104
+
105
+ # update_links_in_tekst("http://www.hlsenteret.no/kunnskapsbasen/Holocaust_og_andre_folkemord",nil)
106
+ # exit
107
+
108
+
109
# Main: walk every (old_url, new_url) pair from the link database and
# update the published page with related-content html and rewritten links.
@vortex = Vortex::Connection.new("https://nyweb4-dav.uio.no", :use_osx_keychain => true)

@old_pages = read_linkbase_from_file
count = 1
@old_pages.each do |old_url, new_url|
  puts count
  puts old_url
  puts new_url
  dest = URI.parse(new_url).path
  puts "Url: '" + dest.to_s + "'"
  related_content_html = generate_related_content(old_url, new_url)

  data = JSON.parse(@vortex.get(dest))
  data['properties']['hideAdditionalContent'] = "false"
  data['properties']['related-content'] = related_content_html

  data['properties']['content'] = update_links_in_tekst(data['properties']['content'], new_url)
  @vortex.put_string(dest, data.to_json)
  puts "-------"

  count += 1
  # exit if count > 10
end