rdf-sak 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,602 @@
1
+ require 'net/http'
2
+ require 'concurrent'
3
+ require 'concurrent-edge'
4
+
5
+ require 'uri'
6
+ require 'uri/urn/uuid'
7
+ require 'time'
8
+
9
+ require 'rdf'
10
+ require 'rdf/rdfa'
11
+ #require 'rdf/vocab'
12
+ require 'rdf/vocab/dc'
13
+ require 'rdf/vocab/prov'
14
+ require 'rdf/sak/ci'
15
+ require 'rdf/sak/tfo'
16
+ require 'tidy_ffi'
17
+ require 'uuidtools'
18
+ require 'nokogiri'
19
+ require 'crass'
20
+
21
+ class RDF::SAK::URLRunner
22
+ private
23
+
24
+ UA = 'RDF::SAK::URLRunner/0.1'.freeze
25
+ TIDY_OPTS = {
26
+ wrap: 0,
27
+ numeric_entities: true,
28
+ tidy_mark: false,
29
+ output_xhtml: true,
30
+ custom_tags: 'inline',
31
+ }.freeze
32
+
33
+ TIDY_U = RDF::URI('urn:x-dummy:tidy').freeze
34
+ XHTMLNS = 'http://www.w3.org/1999/xhtml'.freeze
35
+ TTL = "normalize-space(/html:html/html:head/html:title" \
36
+ "[normalize-space(.) != ''][1])".freeze
37
+ JLD = "//html:script[text()]" \
38
+ "[normalize-space(@type) = 'application/ld+json']".freeze
39
+ QF = /^([^?#]*)(?:\?([^#]*))?(?:#(.*?))?$/.freeze
40
+ SF = /[^[:alpha:][:digit:]\/\?%@!$&'()*+,;=._~-]/.freeze
41
+ TOKEN = /^([^\x0-\x20()<>@,;:\\"\/\[\]?=\x7f-\xff]+)$/n.freeze
42
+ LANG_RE = /^[A-Za-z]+(?:[_-]+[0-9A-Za-z]+)*$/.freeze
43
+
44
+ # xpath for various linkages
45
+ XPATHNS = {
46
+ html: XHTMLNS,
47
+ svg: 'http://www.w3.org/2000/svg',
48
+ atom: 'http://www.w3.org/2005/Atom',
49
+ xlink: 'http://www.w3.org/1999/xlink',
50
+ }
51
+ XBASE_XP = 'ancestor-or-self::*[@xml:base][1]/@xml:base'.freeze
52
+ HTML_XP = %w[*[not(self::html:base)][@href]/@href
53
+ *[@src]/@src object/@data *[@srcset]/@srcset
54
+ form/@action].map { |e| '//html:%s' % e }.join(?|).freeze
55
+ ATOM_XP = %w[uri content/@src category/@scheme generator/@uri icon id
56
+ link/@href logo].map { |e| '//atom:%s' % e }.join(?|).freeze
57
+ RSS_XP = %w[image docs source/@url enclosure/@url
58
+ guid comments].map { |e| '//%s' % e }.join(?|).freeze
59
+ XLINK_XP = '//*/@xlink:href'.freeze
60
+
61
+ def uri_pp uri
62
+ m = QF.match uri.to_s
63
+ out = m[1]
64
+ [[2, ??], [3, ?#]].each do |i, c|
65
+ next if m[i].nil?
66
+ clean = m[i].gsub(SF) { |s| sprintf('%X', s.ord) }
67
+ out += c + clean
68
+ end
69
+
70
+ out
71
+ end
72
+
73
+ def css_urls css
74
+ out = []
75
+ case css
76
+ when Array
77
+ css.each { |c| out += find_urls(c) }
78
+ when Hash
79
+ if css[:node] == :url
80
+ out << css[:value]
81
+ else
82
+ out += find_urls(css.values)
83
+ end
84
+ end
85
+
86
+ out.uniq.compact
87
+ end
88
+
89
+ def title_term title
90
+ title = title.first if title.is_a? Array
91
+ return unless title
92
+
93
+ text = title.content.strip
94
+ unless text.empty?
95
+ lang = if title.lang and LANG_RE.match? title.lang.strip
96
+ title.lang.strip.downcase.tr_s '_-', ?-
97
+ end
98
+ return RDF::Literal(text, language: lang)
99
+ end
100
+ end
101
+
102
+ # sponge cartridges should add triples and return all the resolved
103
+ # (RDF::)URIs to the caller
104
+ SPONGE = {
105
+ 'application/xhtml+xml': -> content, uri {
106
+ rs = RDF::URI(uri.to_s)
107
+
108
+ # lol we have done this a million times before
109
+ if title = content.xpath('/html:html/html:head/html:title', XPATHNS)
110
+ if title = title_term(title)
111
+ @repo << [rs, RDF::Vocab::DC.title, title]
112
+ end
113
+ end
114
+
115
+ content.xpath(HTML_XP, XPATHNS).map do |node|
116
+ # html will always be an attribute
117
+ pred = case node.parent.name
118
+ when 'script' then RDF::Vocab::DC.requires
119
+ when 'a', 'link', 'area', 'form' then RDF::Vocab::DC.references
120
+ else
121
+ RDF::Vocab::DC.hasPart
122
+ end
123
+
124
+ # srcset gets special treatment
125
+ objs = if node.name == 'srcset'
126
+ node.content.strip.split(/\s*,\s+/).map do |u|
127
+ u.split.first.strip
128
+ end
129
+ else
130
+ [node.content.strip]
131
+ end.map { |u| RDF::URI((uri + u).to_s) }
132
+
133
+ # phew
134
+ objs.each { |o| @repo << [rs, pred, o] }
135
+ end.flatten # don't forget to flatten
136
+ },
137
+ 'application/atom+xml': -> content, uri {
138
+ rs = RDF::URI(uri.to_s)
139
+
140
+ if title = content.xpath('/atom:feed/atom:title', XPATHNS)
141
+ if title = title_term(title)
142
+ @repo << [rs, RDF::Vocab::DC.title, title]
143
+ end
144
+ end
145
+
146
+ content.xpath(ATOM_XP, XPATHNS).map do |node|
147
+ o = RDF::URI((uri + node.content.strip).to_s)
148
+ @repo << [rs, RDF::Vocab::DC.references, o]
149
+ o
150
+ end
151
+ },
152
+ 'application/x-rss+xml': -> content, uri {
153
+ rs = RDF::URI(uri.to_s)
154
+
155
+ if title = content.xpath('/rss/channel/title', XPATHNS)
156
+ if title = title_term(title)
157
+ @repo << [rs, RDF::Vocab::DC.title, title]
158
+ end
159
+ end
160
+
161
+ content.xpath(RSS_XP, XPATHNS).map do |node|
162
+ o = RDF::URI((uri + node.content.strip).to_s)
163
+ @repo << [rs, RDF::Vocab::DC.references, o]
164
+ o
165
+ end
166
+ },
167
+ 'image/svg+xml': -> content, uri {
168
+ if title = content.xpath('//svg:title', XPATHNS)
169
+ if title = title_term(title)
170
+ @repo << [rs, RDF::Vocab::DC.title, title]
171
+ end
172
+ end
173
+
174
+ content.xpath(XLINK_XP, XPATHNS).map do |node|
175
+ o = RDF::URI((uri + node.content.strip).to_s)
176
+ @repo << [rs, RDF::Vocab::DC.references, o]
177
+ o
178
+ end
179
+ },
180
+ 'text/css': -> content, uri {
181
+ rs = RDF::URI(uri.to_s)
182
+ css = Crass.parse content
183
+ css_urls(css).map do |u|
184
+ ro = RDF::URI((uri + u).to_s)
185
+ @repo << [rs, RDF::Vocab::DC.requires, ro]
186
+ ro
187
+ end
188
+ }
189
+ }.freeze
190
+
191
  # Map a root element's namespace (or, for namespaceless documents, its
  # root element name) to a canonical content type; consulted by #xml_type.
  # A nil namespace keys a nested name => type table.
  XMLMAP = {
    nil => {
      'rss' => 'application/x-rss+xml',
      'html' => 'text/html',
    },
    XHTMLNS => 'application/xhtml+xml',
    'http://www.w3.org/2005/Atom' => 'application/atom+xml',
    'http://www.w3.org/2000/svg' => 'image/svg+xml',
  }.freeze
200
+
201
+ def xml_type doc
202
+ r = doc.root
203
+ if r and x = XMLMAP[r.namespace]
204
+ if x.is_a? Hash
205
+ return x[r.name] if x[r.name]
206
+ else
207
+ return x
208
+ end
209
+ end
210
+ 'application/xml'
211
+ end
212
+
213
+ public
214
+
215
+ # NS = {
216
+ # rdf: RDF::RDFV,
217
+ # rdfs: RDF::RDFS,
218
+ # owl: RDF::OWL,
219
+ # xsd: RDF::XSD,
220
+ # xhv: RDF::Vocab::XHV,
221
+ # http: RDF::Vocabulary.new('http://www.w3.org/2011/http#'),
222
+ # vann: RDF::Vocabulary.new('http://purl.org/vocab/vann/'),
223
+ # skos: RDF::Vocab::SKOS,
224
+ # dcat: RDF::Vocab::DCAT,
225
+ # so: RDF::Vocab::SCHEMA,
226
+ # dct: RDF::Vocab::DC,
227
+ # ci: RDF::SAK::CI,
228
+ # ogp: RDF::Vocab::OG,
229
+ # foaf: RDF::Vocab::FOAF,
230
+ # org: RDF::Vocab::ORG,
231
+ # bibo: RDF::Vocab::BIBO,
232
+ # qb: RDF::Vocabulary.new('http://purl.org/linked-data/cube#'),
233
+ # }.freeze
234
+
235
+ def initialize store: nil, repo: nil, ua: nil, ignore: nil, traverse: nil
236
+ @store = store
237
+ @repo = repo
238
+ # @urls = Concurrent::Array.new
239
+ @jobs = Concurrent::Array.new
240
+ @seen = Concurrent::Map.new
241
+
242
+ # @fchan = Concurrent::Promises::Channel.new 10
243
+ # @schan = Concurrent::Promises::Channel.new 10
244
+ # @tchan = Concurrent::Promises::Channel.new 10
245
+
246
+ @fthrot = Concurrent::Throttle.new 5
247
+
248
+ # @done = Concurrent::Cancellation.new
249
+
250
+ @ua = ua ? ua.to_s : UA
251
+ @ignore = ignore ? ignore.respond_to?(:to_a) ? ignore.to_a : [ignore] : []
252
+ @traverse = traverse ?
253
+ traverse.respond_to?(:to_a) ? traverse.to_a : [traverse] : []
254
+ # other stuff
255
+
256
+ #Signal.trap(:INT) { warn 'FART'; @done.origin.resolve }
257
+ end
258
+
259
+ def fetch url, redir = 10
260
+ # `return` in a lambda is okay but in a block you have to use
261
+ # `break`, and it complains if you do that
262
+
263
+ url = URI(url.to_s) unless url.is_a? URI
264
+ url.normalize!
265
+
266
+ # XXX apparently you can't just *return* a fulfilled future, you
267
+ # have to assign it to something.
268
+ bailout = Concurrent::Promises.fulfilled_future nil
269
+ return bailout unless %w[http https].include? url.scheme
270
+
271
+ # nuke the fragment
272
+ url.fragment = nil
273
+
274
+ # make sure we don't do this a second time
275
+ return bailout if seen? url
276
+ @seen[url.to_s] = url
277
+
278
+ ru = RDF::URI(url.to_s)
279
+
280
+ # obtain last-modified from object
281
+ q = RDF::Query.new { pattern [ru, RDF::Vocab::DC.modified, :m] }
282
+
283
+ ims = q.execute(@repo).map do |s|
284
+ s[:m].object.to_time.getgm if
285
+ s[:m].literal? and s[:m].object.respond_to? :to_time
286
+ end.compact.sort { |a, b| b <=> a }.first
287
+
288
+ # XXX this is a little too low-level, don't you think?
289
+ http = Net::HTTP.new(url.hostname, url.port)
290
+ http.continue_timeout = 10
291
+ http.open_timeout = 30
292
+ http.read_timeout = 10
293
+ http.write_timeout = 10
294
+ http.use_ssl = url.is_a?(URI::HTTPS)
295
+ http.start
296
+
297
+ hdr = { 'User-Agent' => @ua, 'Connection' => 'close' }
298
+ hdr['If-Modified-Since'] = ims.rfc2822 if ims
299
+ req = Net::HTTP::Get.new url, hdr
300
+ resp = http.request req
301
+ http.finish
302
+
303
+ case resp
304
+ when Net::HTTPSuccess
305
+ Concurrent::Promises.fulfilled_future(resp)
306
+ when Net::HTTPNotModified
307
+ warn "Already seen #{url}"
308
+ bailout
309
+ when Net::HTTPRedirection
310
+ raise Net::HTTPClientException.new "Too many redirects (#{redir})",
311
+ resp if redir <= 0
312
+ unless dest = resp['location']
313
+ raise Net::HTTPBadResponse.new(
314
+ "Redirect on #{url} missing Location header", resp)
315
+ end
316
+
317
+ dest = (url + dest).normalize
318
+
319
+ @repo << [ru, RDF::SAK::CI.canonical, RDF::URI(dest.to_s)]
320
+
321
+ raise Net::HTTPClientException.new "Loop detected on #{url}",
322
+ resp if url == dest
323
+
324
+ fetch(dest, redir - 1)
325
+ else
326
+ raise Net::HTTPClientException.new "Response failed #{url}", resp
327
+ end
328
+ end
329
+
330
+ def store resp
331
+ return unless resp
332
+
333
+ now = Time.now.getgm # we mint a single "now" to use everywhere
334
+ date = if d = resp['Date']
335
+ # who knows what weird shit is coming off the wire
336
+ d.gsub!(/^([^,]*(?:,[^,]*)?)(?:\s*,.*)?$/, "\\1")
337
+ Time.httpdate(d).getgm rescue now
338
+ else
339
+ now # rfc says server MUST send a date header, but does it?
340
+ end
341
+ mtime = if lm = resp['Last-Modified']
342
+ # lol god two Last-Modified headers no that's not messed up
343
+ lm.gsub!(/^([^,]*(?:,[^,]*)?)(?:\s*,.*)?$/, "\\1")
344
+ delta = now - date
345
+
346
+ Time.httpdate(lm).getgm + delta rescue nil
347
+ end
348
+ lang = if resp['Content-Language']
349
+ # same friggin deal
350
+ resp['Content-Language'].strip.split('\s*,+\s*').first
351
+ end
352
+ charset = nil
353
+ if type = resp['Content-Type']
354
+ # and again, wtf
355
+ type = type.split(/\s*,+\s*/).first || 'application/octet-stream'
356
+ type, *params = type.strip.split(/\s*;\s*/).reject(&:empty?)
357
+ params = params.map do |p|
358
+ p.split(/\s*=\s*/, 2)
359
+ end.reject { |p| p.length < 2 }.to_h.transform_keys(&:downcase)
360
+
361
+ charset = params['charset'] if TOKEN.match? params['charset']
362
+ end
363
+
364
+
365
+ # obj = @store.add resp.body, strict: false,
366
+ # type: type, charset: charset, language: lang, mtime: mtime
367
+
368
+ s = RDF::URI(resp.uri.to_s) # - the subject
369
+ cs = RDF::Changeset.new # - a receptacle for statements
370
+
371
+ cs << [s, RDF::Vocab::DC.modified, obj.mtime.getgm]
372
+ #cs << [s, RDF::Vocab::DC.hasVersion, RDF::URI(obj[:"sha-256"].to_s)]
373
+
374
+ cs.apply @repo
375
+
376
+ obj
377
+ #nil
378
+ end
379
+
380
  # Run an HTML store object through TidyFFI, producing (and storing) an
  # XHTML version, and record the transformation as a TFO application in
  # @repo. Returns the tidied store object, a previously tidied one if the
  # job was already done, or nil when +obj+ is nil or not HTML.
  def tidy obj
    if obj and /html/i.match? obj.type
      # obtain original digest uri
      oldu = RDF::URI(obj[:"sha-256"].to_s)

      # first let's detect if this job has already been done
      RDF::Query.new do
        pattern [:a, RDF::SAK::TFO.input, oldu]
        pattern [:a, RDF::SAK::TFO.transform, TIDY_U]
        pattern [:a, RDF::SAK::TFO.output, :n]
      end.execute(@repo).map do |s|
        # can't completely trust what's in the repo so check then convert
        URI(s[:n].to_s) if s[:n].uri? and s[:n].scheme.downcase == 'ni'
      end.compact.each do |n|
        # just because it's in the rdf doesn't mean it's in the store
        out = @store.get n
        return out if out and !out.deleted? # don't return an empty record
      end

      # tidy the object and reinsert it back into the store as xhtml
      start = Time.now.getgm
      if clean = TidyFFI::Tidy.clean(obj.content.read, TIDY_OPTS)
        newobj = @store.add clean, mtime: obj.mtime,
          type: 'application/xhtml+xml', language: obj.language,
          charset: obj.charset, encoding: obj.encoding
        stop = Time.now.getgm
        newu = RDF::URI(newobj[:"sha-256"].to_s)

        # only write the provenance record if an equivalent one isn't
        # already in the repo
        q = RDF::Query.new do
          pattern [:a, RDF::SAK::TFO.input, oldu]
          pattern [:a, RDF::SAK::TFO.output, newu]
        end

        if q.execute(@repo).empty?
          # mint a fresh UUID to name this transformation event
          s = RDF::URI(UUIDTools::UUID.random_create.to_uri)
          cs = RDF::Changeset.new
          cs << [s, RDF.type, RDF::SAK::TFO.Application]
          cs << [s, RDF::Vocab::PROV.startedAtTime, start]
          cs << [s, RDF::Vocab::PROV.endedAtTime, stop]
          cs << [s, RDF::SAK::TFO.transform, TIDY_U]
          cs << [s, RDF::SAK::TFO.input, oldu]
          cs << [s, RDF::SAK::TFO.output, newu]

          cs.apply @repo
        end

        # value of the whole method when tidying succeeded
        newobj
      end
    end
  end
430
+
431
+ def sponge obj
432
+ return unless obj
433
+ #if obj and /xml/.match? obj.type
434
+
435
+ # get rdf stuff
436
+ ru = RDF::URI(obj[:"sha-256"].to_s)
437
+ uri = RDF::Query.new do
438
+ pattern [:b, RDF::SAK::TFO.output, ru]
439
+ pattern [:b, RDF::SAK::TFO.transform, TIDY_U]
440
+ pattern [:b, RDF::SAK::TFO.input, :a]
441
+ pattern [:s, RDF::Vocab::DC.hasVersion, :a]
442
+ end.execute(@repo) + RDF::Query.new do
443
+ pattern [:s, RDF::Vocab::DC.hasVersion, ru]
444
+ end.execute(@repo).uniq.map do |sol|
445
+ u = sol[:s]
446
+ u if u.uri? and u.scheme and %w[http https].include? u.scheme.downcase
447
+ end.compact.first
448
+
449
+ uuri = URI(uri ? uri_pp(uri.to_s) : ru)
450
+ content = obj.content
451
+ sponge = SPONGE[obj.type]
452
+
453
+ if /xml/.match? type
454
+ content = Nokogiri.XML(content, uuri.to_s)
455
+ unless sponge
456
+ type = xml_type(content)
457
+ sponge = SPONGE[type] || SPONGE['image/svg+xml'] # svg is just xlink
458
+ end
459
+ end
460
+
461
+ if sponge
462
+ instance_exec(content, uuri, &sponge).compact.uniq.each do |link|
463
+ enqueue link if traverse? link
464
+ end
465
+ end
466
+
467
+ end
468
+
469
  # Schedule +uri+ for the full fetch -> store -> tidy -> sponge pipeline
  # on the fetch throttle. Returns the throttled future, or nil when the
  # URI has already been seen or is on the ignore list.
  def enqueue uri
    uri = URI(uri_pp uri.to_s).normalize

    return if seen? uri
    return if ignored? uri

    warn "enqueuing #{uri}"

    @fthrot.future(uri) do |u|
      fr = fetch u
      fr.on_rejection { |reason| warn "fetch fail: #{reason.inspect}" }
      fs = fr.then do |resp|
        begin
          store resp
        # NOTE(review): rescuing Exception is very broad (catches
        # SignalException etc.); presumably deliberate so one bad response
        # can't kill the run — confirm before narrowing
        rescue Exception => e
          warn e
        end
      end
      fs.on_rejection { |reason| warn "store fail: #{reason.inspect}" }
      ft = fs.then do |obj|
        # only HTML objects go through tidy; everything else passes through
        if obj and /html/i.match? obj.type
          warn "tidying #{obj[:"sha-256"]}"
          tidy obj
        else
          obj
        end
      end
      ft.on_rejection { |reason| warn "tidy fail: #{reason.inspect}" }
      ft.then { |obj| sponge obj }
    end.on_rejection { |reason| warn "throttle fail: #{reason.inspect}" }
  end
500
+
501
+ def seen? uri
502
+ !!@seen[uri.to_s]
503
+ end
504
+
505
+ def traverse? uri
506
+ return if @traverse.empty?
507
+
508
+ re = /(?:^|\.)#{@traverse.map { |t| Regexp.escape t }.join ?|}$/o
509
+ re.match? uri.host
510
+ end
511
+
512
+ def ignored? uri
513
+ return if @ignore.empty?
514
+
515
+ re = /(?:^|\.)#{@ignore.map { |t| Regexp.escape t }.join ?|}$/o
516
+ re.match? uri.host
517
+ end
518
+
519
+ def run urls, shuffle: false
520
+ # ingest URLs and prune
521
+ urls = urls.map { |u| URI(uri_pp u).normalize }.reject { |u| ignored? u }
522
+
523
+ # optionally randomize
524
+ urls.shuffle! if shuffle
525
+
526
+ # now add the queue
527
+ urls.map { |url| enqueue url }.map(&:wait)
528
+
529
+ # while job = @jobs.shift
530
+ # job.wait
531
+ # end
532
+
533
+ self
534
+ end
535
+ end
536
+
537
+
538
if __FILE__ == $0
  require 'pathname'
  require 'rdf/turtle'
  require 'rdf/lmdb'
  require 'store/digest'
  require 'commander'

  # Minimal CLI wrapper: collect URLs from argv and/or a CSV file, run
  # them through a URLRunner backed by LMDB stores, and optionally dump
  # the resulting graph as Turtle.
  Commander.configure do
    program :name, 'URLRunner'
    program :version, '0.0.0'
    program :description, 'snarf a bunch of URLs for great justice'

    command :process do |c|
      c.syntax = 'process [options] [urls]'
      c.description = 'just do the friggin urls already'

      c.option '-c', '--csv FILE',
        'a CSV file containing URLs in the first column'
      c.option '-i', '--ignore DOMAIN[,DOMAIN...]', Array,
        'domain(s) to ignore'
      c.option '-t', '--traverse DOMAIN[,DOMAIN...]', Array,
        'traverse links within hypermedia documents'
      c.option '-p', '--print', 'print the results when finished'
      c.option '-A', '--user-agent STRING', 'override the User-Agent string'
      c.option '--shuffle', 'shuffle the list of URLs'

      # wtf why did i have to do this
      c.option '-s', '--store DIR', 'Directory for digest store'
      c.option '-r', '--rdf DIR', 'Directory for RDF store'

      c.action do |args, options|
        raise ArgumentError, 'Store and RDF directories are required' unless
          options.store and options.rdf
        # 2**27 == 128 MiB LMDB map size for both stores
        repo = RDF::LMDB::Repository.new options.rdf, mapsize: 2**27
        store = Store::Digest.new dir: options.store, mapsize: 2**27

        urls = args.dup

        # append URLs from the first column of the CSV, if given
        if options.csv
          require 'csv'
          csv = Pathname(options.csv).expand_path
          raise ArgumentError, "CSV file #{csv} is not readable" unless
            csv.file? and csv.readable?
          urls += CSV.read(csv).map(&:first).compact
        end

        RDF::SAK::URLRunner.new(
          repo: repo, store: store, ua: options.user_agent,
          ignore: options.ignore, traverse: options.traverse
        ).run urls, shuffle: options.shuffle

        if options.print
          print repo.dump :turtle #, prefixes: URLRunner::NS
        end
      end
    end

    # XXX these should work wtf
    #global_option '-s', '--store DIR', 'Directory for digest store'
    #global_option '-r', '--rdf DIR', 'Directory for RDF store'

    default_command :process

  end
end