RubyGems - infod - Versions diffs - 0.0.3.3 → 0.0.3.4 - Mend

infod 0.0.3.3 → 0.0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31)

data/infod/feed.rb CHANGED

@@ -1,126 +1,169 @@
 #watch __FILE__
-module FeedParse
-  def html; CGI.unescapeHTML self end
-  def cdata; sub /^\s*<\!\[CDATA\[(.*?)\]\]>\s*$/m,'\1'end
-  def guess; send (case self
-                   when /^\s*<\!/m
-                     :cdata
-                   when /</m
-                     :id
-                   else
-                     :html
-                   end) end
-  def parse
-    x={} # populate XMLns prefix table
-    match(/<(rdf|rss|feed)([^>]+)/i)[2].scan(/xmlns:?([a-z]+)?=["']?([^'">\s]+)/){|m|x[m[0]]=m[1]}
-    # scan for resources
-    scan(%r{<(?<ns>rss:|atom:)?(?<tag>item|entry)(?<attrs>[\s][^>]*)?>(?<inner>.*?)</\k<ns>?\k<tag>>}mi){|m|
-      # identifier search
-      attrs = m[2]
-      inner = m[3]
-      u = attrs.do{|a| # RDF-style identifier (RSS 1.0)
-        a.match(/about=["']?([^'">\s]+)/).do{|s|
-          s[1] }} ||
-      (inner.match(/<link>([^<]+)/) || # <link> child-node or href attribute
-       inner.match(/<link[^>]+rel=["']?alternate["']?[^>]+href=["']?([^'">\s]+)/) ||
-       inner.match(/<(?:gu)?id[^>]*>([^<]+)/)).do{|s| s[1]} # <id> child
-      if u
-        if !u.match /^http/
-          puts "no HTTP URIs found #{u}"
-          u = '/junk/'+u.gsub('/','.')
-        end
-        yield u, R::Type, (R::SIOCt+'BlogPost').R
-        yield u, R::Type, (R::SIOC+'Post').R
-        #links
-        inner.scan(%r{<(link|enclosure|media)([^>]+)>}mi){|e|
-          e[1].match(/(href|url|src)=['"]?([^'">\s]+)/).do{|url|
-            yield(u,R::Atom+'/link/'+((r=e[1].match(/rel=['"]?([^'">\s]+)/)) ? r[1] : e[0]), url[2].R)}}
-        #elements
-        inner.scan(%r{<([a-z]+:)?([a-z]+)([\s][^>]*)?>(.*?)</\1?\2>}mi){|e|
-          yield u,                           # s
-          (x[e[0]&&e[0].chop]||R::RSS)+e[1], # p
-          e[3].extend(FeedParse).guess.do{|o|# o
-            o.match(/\A(\/|http)[\S]+\Z/) ? o.R : R::F['cleanHTML'][o]
-          }}
-      else
-        puts "no post-identifiers found #{u}"
-      end
-      }
-    nil
-  end
-end
 class R
+  def getFeed h='localhost'; addDocsRDF :format => :feed, :hook => FeedArchiver, :hostname => h end
   Atom = W3+'2005/Atom'
    RSS = Purl+'rss/1.0/'
   RSSm = RSS+'modules/'
-  Feed = (R RSS+'channel')
   def listFeeds; (nokogiri.css 'link[rel=alternate]').map{|u|R (URI uri).merge(u.attr :href)} end
   alias_method :feeds, :listFeeds
-  # add existing resources to index
-  #
-  # 'http:/'.R.take.select{|e|e.ext=='e'}.map{|r|R::FeedArchiver[r,r.graph,'localhost']}
+  module Feed
+    class Format < RDF::Format
+      content_type     'application/atom+xml', :extension => :atom
+      content_encoding 'utf-8'
+      reader { R::Feed::Reader }
+    end
+    class Reader < RDF::Reader
+      format Format
+      def initialize(input = $stdin, options = {}, &block)
+        @doc = (input.respond_to?(:read) ? input : StringIO.new(input.to_s)).read
+        if block_given?
+          case block.arity
+          when 0 then instance_eval(&block)
+          else block.call(self)
+          end
+        end
+        nil
+      end
+      def each_statement &fn
+        dateNormalize(:resolveURIs,:mapPredicates,:rawFeedTriples){|s,p,o|
+          fn.call RDF::Statement.new(s.R, p.R,
+                                     o.class == R ? o : (l = RDF::Literal o
+                                                         l.datatype=RDF.XMLLiteral if p == Content
+                                                         l),
+                                     :context => s.R.docBase)} end
+      def each_triple &block
+        each_statement{|s| block.call *s.to_triple}
+      end
-  FeedArchiver = -> doc, graph, host {
-    doc.roonga host
-    graph.map{|u,r|
-      r[Date].do{|t| # link doc to date-index
-        t = t[0].gsub(/[-T]/,'/').sub /(.00.00|Z)$/, '' # trim normalized timezones and non-unique symbols
-        stop = /\b(at|blog|com(ments)?|html|info|org|photo|p|post|r|status|tag|twitter|wordpress|www|1999|2005)\b/
-        b = (u.sub(/http:\/\//,'.').gsub(/\W/,'..').gsub(stop,'').sub(/\d{12,}/,'')+'.').gsub /\.+/,'.'
-        doc.ln R["http://#{host}/news/#{t}#{b}e"]}}
-  doc}
+      def resolveURIs *f
+        send(*f){|s,p,o|
+          yield s, p, p == Content ?
+          (Nokogiri::HTML.fragment o).do{|o|
+            o.css('a').map{|a|
+              if a.has_attribute? 'href'
+                (a.set_attribute 'href', (URI.join s, (a.attr 'href'))) rescue nil
+              end}
+            o.to_s} : o}
+      end
-  GREP_DIRS.push /^\/news\/\d{4}/
+      def mapPredicates *f
+        send(*f){|s,p,o|
+          yield s,
+          { Purl+'dc/elements/1.1/creator' => Creator,
+            Purl+'dc/elements/1.1/subject' => SIOC+'subject',
+            Atom+'author' => Creator,
+            RSS+'description' => Content,
+            RSS+'encoded' => Content,
+            RSSm+'content/encoded' => Content,
+            Atom+'content' => Content,
+            RSS+'title' => Title,
+            Atom+'title' => Title,
+          }[p]||p,
+          o }
+      end
-  def getFeed       g; addDocs :triplrFeed, g, nil, FeedArchiver end
-  def getFeedReddit g; addDocs :triplrFeedReddit, g, nil, FeedArchiver end
+      def rawFeedTriples
+        x={} #XMLns prefix table
+        @doc.match(/<(rdf|rss|feed)([^>]+)/i)[2].scan(/xmlns:?([a-z]+)?=["']?([^'">\s]+)/){|m|x[m[0]]=m[1]}
+        # resources
+        @doc.scan(%r{<(?<ns>rss:|atom:)?(?<tag>item|entry)(?<attrs>[\s][^>]*)?>(?<inner>.*?)</\k<ns>?\k<tag>>}mi){|m|
+          # identifier search
+          attrs = m[2]
+          inner = m[3]
+          u = attrs.do{|a| # RDF-style identifier (RSS 1.0)
+            a.match(/about=["']?([^'">\s]+)/).do{|s|
+              s[1] }} ||
+          (inner.match(/<link>([^<]+)/) || # <link> child-node or href attribute
+           inner.match(/<link[^>]+rel=["']?alternate["']?[^>]+href=["']?([^'">\s]+)/) ||
+           inner.match(/<(?:gu)?id[^>]*>([^<]+)/)).do{|s| s[1]} # <id> child
+          if u
+            if !u.match /^http/
+              puts "no HTTP URIs found #{u}"
+              u = '/junk/'+u.gsub('/','.')
+            end
+            yield u, R::Type, (R::SIOCt+'BlogPost').R
+            yield u, R::Type, (R::SIOC+'Post').R
+            #links
+            inner.scan(%r{<(link|enclosure|media)([^>]+)>}mi){|e|
+              e[1].match(/(href|url|src)=['"]?([^'">\s]+)/).do{|url|
+                yield(u,R::Atom+'/link/'+((r=e[1].match(/rel=['"]?([^'">\s]+)/)) ? r[1] : e[0]), url[2].R)}}
+            #elements
+            inner.scan(%r{<([a-z]+:)?([a-z]+)([\s][^>]*)?>(.*?)</\1?\2>}mi){|e|
+              yield u,                           # s
+              (x[e[0]&&e[0].chop]||R::RSS)+e[1], # p
+           e[3].extend(SniffContent).sniff.do{|o|# o
+                o.match(/\A(\/|http)[\S]+\Z/) ? o.R : R::F['cleanHTML'][o]
+              }}
+          else
+            puts "no post-identifiers found #{u}"
+          end
+        }
-  def triplrFeed &f
-    dateNorm :contentURIresolve,:triplrFeedNormalize,:triplrFeedRaw,&f
-  end
+      end
+      def dateNormalize *f
+        send(*f){|s,p,o|
+          yield *({'CreationDate' => true,
+                    'Date' => true,
+                    RSS+'pubDate' => true,
+                    Date => true,
+                    Purl+'dc/elements/1.1/date' => true,
+                    Atom+'published' => true,
+                    Atom+'updated' => true
+                  }[p] ?
+                  [s,Date,Time.parse(o).utc.iso8601] : [s,p,o])}
+      end
-  def triplrFeedReddit &f
-    triplrFeed {|s,p,o|
-     p == Content ?
-      Nokogiri::HTML.parse(o).do{|o|
-        o.css('.md').do{|o|yield s,p,o}
-        yield s,Creator,o.css('a')[-4].child.to_s.strip
-        yield s,Type,(SIOCt+'BoardPost').R
-      } : (yield s,p,o)}
-  end
+    end
+    module SniffContent
+      def sniff
+        send (case self
+              when /^\s*<\!/m
+                :cdata
+              when /</m
+                :id
+              else
+                :html
+              end)
+      end
+      def html
+        CGI.unescapeHTML self
+      end
+      def cdata
+        sub /^\s*<\!\[CDATA\[(.*?)\]\]>\s*$/m,'\1'
+      end
+    end
-  def triplrFeedRaw &f
-    read.to_utf8.extend(FeedParse).parse &f
-  rescue Exception => e
-    puts [uri,e,e.backtrace[0]].join ' '
   end
-  def triplrFeedNormalize *f
-    send(*f){|s,p,o|
-      yield s,
-      { Purl+'dc/elements/1.1/creator' => Creator,
-        Purl+'dc/elements/1.1/subject' => SIOC+'subject',
-        Atom+'author' => Creator,
-        RSS+'description' => Content,
-        RSS+'encoded' => Content,
-        RSSm+'content/encoded' => Content,
-        Atom+'content' => Content,
-        RSS+'title' => Title,
-        Atom+'title' => Title,
-      }[p]||p,
-      o } end
+  FeedStop = /\b(at|blog|com(ments)?|html|info|org|photo|p|post|r|status|tag|twitter|wordpress|www|1999|2005)\b/
+  FeedArchiver = -> doc, graph, host {
+    doc.roonga host
+    graph.query(RDF::Query::Pattern.new(:s,R[R::Date],:o)).first_value.do{|t|
+      time = t.gsub(/[-T]/,'/').sub /(.00.00|Z)$/, '' # trim normalized timezones
+      base = (graph.name.to_s.sub(/http:\/\//,'.').gsub(/\W/,'..').gsub(FeedStop,'').sub(/\d{12,}/,'')+'.').gsub /\.+/,'.'
+      doc.ln R["http://#{host}/news/#{time}#{base}n3"]}}
+  GREP_DIRS.push /^\/news\/\d{4}/
   fn Render+'application/atom+xml',->d,e{
     id = 'http://' + e['SERVER_NAME'] + (CGI.escapeHTML e['REQUEST_URI'])
@@ -140,8 +183,6 @@ class R
                      d[Creator].do{|c|{_: :author, c: c[0]}},
                      {_: :content, type: :xhtml,
                        c: {xmlns:"http://www.w3.org/1999/xhtml",
-                         c: d[Content]}}].cr
-               }}.cr
-            ]}])}
+                         c: d[Content]}}].cr}}.cr]}])}
 end

data/infod/fs.rb CHANGED

@@ -1,5 +1,9 @@
 #watch __FILE__
 class R
+=begin
+  a RDF::URI has a path defined in names.rb, so do other concepts like a full "triple" - here we've built a RDF store using them
+  since this results in one path per-triple, it's mainly used for current resource-state and "backlink" (reverse order) indexing
+=end
   def [] p; predicate p end
   def []= p,o
@@ -11,6 +15,14 @@ class R
     end
   end
+  def predicatePath p, s = true
+    container.as s ? p.R.shorten : p
+  end
+  def predicates
+    container.c.map{|c|c.base.expand.R}
+  end
   def predicate p, short = true
     p = predicatePath p, short
     p.node.take.map{|n|
@@ -52,7 +64,9 @@ class R
     end
   end
-  def unsetFs p,o; setFs p,o,true end
+  def unsetFs p,o
+    setFs p,o,true
+  end
   def triplrInode
     if d?
@@ -71,17 +85,23 @@ class R
   def ln t, y=:link
     t = t.R
     t = t.uri[0..-2].R if t.uri[-1] == '/'
-    if !t.e # destination exist?
+    if !t.e
       t.dirname.mk
       FileUtils.send y, node, t.node
     end
   end
+  def delete;   node.deleteNode if e; self end
+  def exist?;   node.exist? end
+  def file?;    node.file? end
   def ln_s t; ln t, :symlink end
+  def mk;       e || FileUtils.mkdir_p(d); self end
+  def mtime;    node.stat.mtime if e end
+  def touch;    FileUtils.touch node; self end
-  def r p=false
+  def read p=false
     if f
-      p ? (JSON.parse readFile) : readFile
+      p ? (JSON.parse File.open(d).read) : File.open(d).read
     else
       nil
     end
@@ -89,12 +109,19 @@ class R
     puts e
   end
-  def w o,s=false
+  def write o,s=false
     dirname.mk
-    writeFile (s ? o.to_json : o)
+    File.open(d,'w'){|f|
+      f << (s ? o.to_json : o)}
     self
   end
+  alias_method :e, :exist?
+  alias_method :f, :file?
+  alias_method :m, :mtime
+  alias_method :r, :read
+  alias_method :w, :write
 end
 class Pathname

data/infod/graph.rb CHANGED

@@ -1,15 +1,12 @@
 #watch __FILE__
 class R
 =begin
-  graph construction is two-pass:
+  two-pass graph construction aka Protograph and Graph
- the first-pass will signify if the second-pass needs to be run. an ETag is be derived from the return-value, ideal fingerprint sources include filestats, mtime checks, extremely trivial SPARQL queries, SHA160 hashes of in-RAM entities.. <http://tools.ietf.org/html/draft-ietf-httpbis-p4-conditional-25#section-2.3>
+  Protograph = ETag. ideal fingerprint sources include filestats, mtime checks (#m), SHA160 hashes of in-RAM entities - <http://tools.ietf.org/html/draft-ietf-httpbis-p4-conditional-25#section-2.3>
-   second-pass might fetch RDF from a SPARQL store. this lib was developed as an alternative to relying on (large, hard-to-implement, must be running, configured & connectable) SPARQL stores by using the filesystem as much as possible, to experiment with hybrids like SPARQLING up a set of files to be returned in standard Apache-as-static-fileserver fashion, and to webize non-RDF filesystem-content like email, directories, URLs in plain-text etc
-  triple streams - a source function yields triples up to the caller as it finds them,
-  a function providing a block (consumes yielded values) is a sink, both is a filter
-  these can be stacked into pipelines. see the data-massaging stream-processing in feed.rb
+  a tripleStream function constructing a block (consumes yielded values) is a sink, inverse is a source, both a filter
+  these can be stacked into pipelines, as in feed.rb
 =end
@@ -21,33 +18,7 @@ class R
     end; m
   end
-=begin
- * stream triples into graph (memory)
- * import missing resources to store (fs)
- * behave as normal triplr to caller, with
-   side-effect of import/indexing to knowledgebase
-=end
-  def addDocs triplr, host, p=nil, hook=nil, &b
-    graph = fromStream({},triplr)
-    docs = {}
-    graph.map{|u,r|
-      e = u.R                 # resource
-      doc = e.ef              # doc
-      doc.e ||                # exists - we're nondestructive here
-      (docs[doc.uri] ||= {}   # init doc-graph
-       docs[doc.uri][u] = r   # add to graph
-       p && p.map{|p|         # index predicate
-         r[p].do{|v|v.map{|o| # values exist?
-             e.index p,o}}})} # index triple
-    docs.map{|d,g|            # resources in docs
-      d = d.R; puts "<#{d.docBase}>"
-      d.w g,true              # write
-      hook[d,g,host] if hook} # insert-hook
-    graph.triples &b if b     # emit triples
-    self
-  end
-  # default protograph - identity < lazy-expandable resource-thunks
+  # default protograph - identity + lazy-expandable resource-thunks
   # Resource, Query, Graph -> graphID
   fn 'protograph/',->e,q,g{
      g['#'] = {'uri' => '#'}
@@ -76,14 +47,9 @@ class R
     s }
   # fs-derived ID for a resource-set
-  fn 'docsID',->g,q{
-   [q.has_key?('nocache').do{|_|rand},
-     g.sort.map{|u,r|
-       [u, r.respond_to?(:m) && r.m]}].h }
-  # default graph (filesystem store)
-  # to use a different default-graph function (w/o patching here, or querystring param), define a GET handler on / (or a subdir),
-  # update configuration such as q['graph'] = 'hexastore' and return false or call #response..
+  fn 'docsID',->g,q{g.sort.map{|u,r|[u, r.respond_to?(:m) && r.m]}.h }
+  # default graph
   fn 'graph/',->e,q,m{
     # force thunks
     m.values.map{|r|(r.env e.env).graphFromFile m if r.class == R }
@@ -120,17 +86,56 @@ class R
     ].flatten.compact
   end
+# GET Resource -> Graph
+# missing resources -> local store
+  # JSON + Hash variant
+  def addDocs triplr, host, p=nil, hook=nil, &b
+    graph = fromStream({},triplr)
+    docs = {}
+    graph.map{|u,r|
+      e = u.R                 # resource
+      doc = e.ef              # doc
+      doc.e ||                # exists - we're nondestructive here
+      (docs[doc.uri] ||= {}   # init doc-graph
+       docs[doc.uri][u] = r   # add to graph
+       p && p.map{|p|         # index predicate
+         r[p].do{|v|v.map{|o| # values exist?
+             e.index p,o}}})} # index triple
+    docs.map{|d,g|            # resources in docs
+      d = d.R; puts "<#{d.docBase}>"
+      d.w g,true              # write
+      hook[d,g,host] if hook} # insert-hook
+    graph.triples &b if b     # emit triples
+    self
+  end
+  # RDF::Graph variant
+  def addDocsRDF options = {}
+    g = RDF::Repository.load self, options
+    g.each_graph.map{|graph|
+      if graph.named?
+        doc = graph.name.n3
+        unless doc.e
+          doc.dirname.mk
+          RDF::Writer.open(doc.d){|f|f << graph} ; puts "<#{doc.docBase}> #{graph.count} triples"
+          options[:hook][doc,graph,options[:hostname]] if options[:hook]
+        end
+      end}
+    g
+  end
   def triplrDoc &f; docBase.glob('#*').map{|s| s.triplrResource &f} end
   def triplrResource; predicates.map{|p| self[p].map{|o| yield uri, p.uri, o}} end
   def triplrJSON
-    yield uri, '/application/json', (JSON.parse read) if e
+    yield uri, '/application/json', r(true) if e
   rescue Exception => e
   end
   def to_json *a
-    to_h.to_json *a
+    {'uri' => uri}.to_json *a
   end
   fn Render+'application/json',->d,_=nil{d.to_json}
@@ -160,8 +165,7 @@ class Hash
   def triples &f
     map{|s,r|
       r.map{|p,o|
-        o.class == Array ? o.each{|o| yield s,p,o} : yield(s,p,o) unless p=='uri'} if r.class == Hash
-    }
+        o.justArray.map{|o|yield s,p,o} unless p=='uri'} if r.class == Hash}
   end
 end