infod 0.0.3.3 → 0.0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,126 +1,169 @@
1
1
  #watch __FILE__
2
-
3
- module FeedParse
4
-
5
- def html; CGI.unescapeHTML self end
6
- def cdata; sub /^\s*<\!\[CDATA\[(.*?)\]\]>\s*$/m,'\1'end
7
- def guess; send (case self
8
- when /^\s*<\!/m
9
- :cdata
10
- when /</m
11
- :id
12
- else
13
- :html
14
- end) end
15
-
16
- def parse
17
- x={} # populate XMLns prefix table
18
- match(/<(rdf|rss|feed)([^>]+)/i)[2].scan(/xmlns:?([a-z]+)?=["']?([^'">\s]+)/){|m|x[m[0]]=m[1]}
19
-
20
- # scan for resources
21
- scan(%r{<(?<ns>rss:|atom:)?(?<tag>item|entry)(?<attrs>[\s][^>]*)?>(?<inner>.*?)</\k<ns>?\k<tag>>}mi){|m|
22
- # identifier search
23
- attrs = m[2]
24
- inner = m[3]
25
- u = attrs.do{|a| # RDF-style identifier (RSS 1.0)
26
- a.match(/about=["']?([^'">\s]+)/).do{|s|
27
- s[1] }} ||
28
- (inner.match(/<link>([^<]+)/) || # <link> child-node or href attribute
29
- inner.match(/<link[^>]+rel=["']?alternate["']?[^>]+href=["']?([^'">\s]+)/) ||
30
- inner.match(/<(?:gu)?id[^>]*>([^<]+)/)).do{|s| s[1]} # <id> child
31
-
32
- if u
33
- if !u.match /^http/
34
- puts "no HTTP URIs found #{u}"
35
- u = '/junk/'+u.gsub('/','.')
36
- end
37
- yield u, R::Type, (R::SIOCt+'BlogPost').R
38
- yield u, R::Type, (R::SIOC+'Post').R
39
-
40
- #links
41
- inner.scan(%r{<(link|enclosure|media)([^>]+)>}mi){|e|
42
- e[1].match(/(href|url|src)=['"]?([^'">\s]+)/).do{|url|
43
- yield(u,R::Atom+'/link/'+((r=e[1].match(/rel=['"]?([^'">\s]+)/)) ? r[1] : e[0]), url[2].R)}}
44
-
45
- #elements
46
- inner.scan(%r{<([a-z]+:)?([a-z]+)([\s][^>]*)?>(.*?)</\1?\2>}mi){|e|
47
- yield u, # s
48
- (x[e[0]&&e[0].chop]||R::RSS)+e[1], # p
49
- e[3].extend(FeedParse).guess.do{|o|# o
50
- o.match(/\A(\/|http)[\S]+\Z/) ? o.R : R::F['cleanHTML'][o]
51
- }}
52
- else
53
- puts "no post-identifiers found #{u}"
54
- end
55
- }
56
-
57
- nil
58
- end
59
- end
60
-
61
2
  class R
62
3
 
4
+ def getFeed h='localhost'; addDocsRDF :format => :feed, :hook => FeedArchiver, :hostname => h end
5
+
63
6
  Atom = W3+'2005/Atom'
64
7
  RSS = Purl+'rss/1.0/'
65
8
  RSSm = RSS+'modules/'
66
- Feed = (R RSS+'channel')
67
9
 
68
10
  def listFeeds; (nokogiri.css 'link[rel=alternate]').map{|u|R (URI uri).merge(u.attr :href)} end
69
11
  alias_method :feeds, :listFeeds
70
12
 
71
- # add existing resources to index
72
- #
73
- # 'http:/'.R.take.select{|e|e.ext=='e'}.map{|r|R::FeedArchiver[r,r.graph,'localhost']}
13
+ module Feed
14
+
15
+ class Format < RDF::Format
16
+ content_type 'application/atom+xml', :extension => :atom
17
+ content_encoding 'utf-8'
18
+ reader { R::Feed::Reader }
19
+ end
20
+
21
+ class Reader < RDF::Reader
22
+
23
+ format Format
24
+
25
+ def initialize(input = $stdin, options = {}, &block)
26
+ @doc = (input.respond_to?(:read) ? input : StringIO.new(input.to_s)).read
27
+ if block_given?
28
+ case block.arity
29
+ when 0 then instance_eval(&block)
30
+ else block.call(self)
31
+ end
32
+ end
33
+ nil
34
+ end
35
+
36
+ def each_statement &fn
37
+ dateNormalize(:resolveURIs,:mapPredicates,:rawFeedTriples){|s,p,o|
38
+ fn.call RDF::Statement.new(s.R, p.R,
39
+ o.class == R ? o : (l = RDF::Literal o
40
+ l.datatype=RDF.XMLLiteral if p == Content
41
+ l),
42
+ :context => s.R.docBase)} end
43
+
44
+ def each_triple &block
45
+ each_statement{|s| block.call *s.to_triple}
46
+ end
74
47
 
75
- FeedArchiver = -> doc, graph, host {
76
- doc.roonga host
77
- graph.map{|u,r|
78
- r[Date].do{|t| # link doc to date-index
79
- t = t[0].gsub(/[-T]/,'/').sub /(.00.00|Z)$/, '' # trim normalized timezones and non-unique symbols
80
- stop = /\b(at|blog|com(ments)?|html|info|org|photo|p|post|r|status|tag|twitter|wordpress|www|1999|2005)\b/
81
- b = (u.sub(/http:\/\//,'.').gsub(/\W/,'..').gsub(stop,'').sub(/\d{12,}/,'')+'.').gsub /\.+/,'.'
82
- doc.ln R["http://#{host}/news/#{t}#{b}e"]}}
83
- doc}
48
+ def resolveURIs *f
49
+ send(*f){|s,p,o|
50
+ yield s, p, p == Content ?
51
+ (Nokogiri::HTML.fragment o).do{|o|
52
+ o.css('a').map{|a|
53
+ if a.has_attribute? 'href'
54
+ (a.set_attribute 'href', (URI.join s, (a.attr 'href'))) rescue nil
55
+ end}
56
+ o.to_s} : o}
57
+ end
84
58
 
85
- GREP_DIRS.push /^\/news\/\d{4}/
59
+ def mapPredicates *f
60
+ send(*f){|s,p,o|
61
+ yield s,
62
+ { Purl+'dc/elements/1.1/creator' => Creator,
63
+ Purl+'dc/elements/1.1/subject' => SIOC+'subject',
64
+ Atom+'author' => Creator,
65
+ RSS+'description' => Content,
66
+ RSS+'encoded' => Content,
67
+ RSSm+'content/encoded' => Content,
68
+ Atom+'content' => Content,
69
+ RSS+'title' => Title,
70
+ Atom+'title' => Title,
71
+ }[p]||p,
72
+ o }
73
+ end
86
74
 
87
- def getFeed g; addDocs :triplrFeed, g, nil, FeedArchiver end
88
- def getFeedReddit g; addDocs :triplrFeedReddit, g, nil, FeedArchiver end
75
+ def rawFeedTriples
76
+ x={} #XMLns prefix table
77
+ @doc.match(/<(rdf|rss|feed)([^>]+)/i)[2].scan(/xmlns:?([a-z]+)?=["']?([^'">\s]+)/){|m|x[m[0]]=m[1]}
78
+
79
+ # resources
80
+ @doc.scan(%r{<(?<ns>rss:|atom:)?(?<tag>item|entry)(?<attrs>[\s][^>]*)?>(?<inner>.*?)</\k<ns>?\k<tag>>}mi){|m|
81
+ # identifier search
82
+ attrs = m[2]
83
+ inner = m[3]
84
+ u = attrs.do{|a| # RDF-style identifier (RSS 1.0)
85
+ a.match(/about=["']?([^'">\s]+)/).do{|s|
86
+ s[1] }} ||
87
+ (inner.match(/<link>([^<]+)/) || # <link> child-node or href attribute
88
+ inner.match(/<link[^>]+rel=["']?alternate["']?[^>]+href=["']?([^'">\s]+)/) ||
89
+ inner.match(/<(?:gu)?id[^>]*>([^<]+)/)).do{|s| s[1]} # <id> child
90
+
91
+ if u
92
+ if !u.match /^http/
93
+ puts "no HTTP URIs found #{u}"
94
+ u = '/junk/'+u.gsub('/','.')
95
+ end
96
+ yield u, R::Type, (R::SIOCt+'BlogPost').R
97
+ yield u, R::Type, (R::SIOC+'Post').R
98
+
99
+ #links
100
+ inner.scan(%r{<(link|enclosure|media)([^>]+)>}mi){|e|
101
+ e[1].match(/(href|url|src)=['"]?([^'">\s]+)/).do{|url|
102
+ yield(u,R::Atom+'/link/'+((r=e[1].match(/rel=['"]?([^'">\s]+)/)) ? r[1] : e[0]), url[2].R)}}
103
+
104
+ #elements
105
+ inner.scan(%r{<([a-z]+:)?([a-z]+)([\s][^>]*)?>(.*?)</\1?\2>}mi){|e|
106
+ yield u, # s
107
+ (x[e[0]&&e[0].chop]||R::RSS)+e[1], # p
108
+ e[3].extend(SniffContent).sniff.do{|o|# o
109
+ o.match(/\A(\/|http)[\S]+\Z/) ? o.R : R::F['cleanHTML'][o]
110
+ }}
111
+ else
112
+ puts "no post-identifiers found #{u}"
113
+ end
114
+ }
89
115
 
90
- def triplrFeed &f
91
- dateNorm :contentURIresolve,:triplrFeedNormalize,:triplrFeedRaw,&f
92
- end
116
+ end
117
+
118
+ def dateNormalize *f
119
+ send(*f){|s,p,o|
120
+ yield *({'CreationDate' => true,
121
+ 'Date' => true,
122
+ RSS+'pubDate' => true,
123
+ Date => true,
124
+ Purl+'dc/elements/1.1/date' => true,
125
+ Atom+'published' => true,
126
+ Atom+'updated' => true
127
+ }[p] ?
128
+ [s,Date,Time.parse(o).utc.iso8601] : [s,p,o])}
129
+ end
93
130
 
94
- def triplrFeedReddit &f
95
- triplrFeed {|s,p,o|
96
- p == Content ?
97
- Nokogiri::HTML.parse(o).do{|o|
98
- o.css('.md').do{|o|yield s,p,o}
99
- yield s,Creator,o.css('a')[-4].child.to_s.strip
100
- yield s,Type,(SIOCt+'BoardPost').R
101
- } : (yield s,p,o)}
102
- end
131
+ end
132
+
133
+ module SniffContent
134
+
135
+ def sniff
136
+ send (case self
137
+ when /^\s*<\!/m
138
+ :cdata
139
+ when /</m
140
+ :id
141
+ else
142
+ :html
143
+ end)
144
+ end
145
+
146
+ def html
147
+ CGI.unescapeHTML self
148
+ end
149
+
150
+ def cdata
151
+ sub /^\s*<\!\[CDATA\[(.*?)\]\]>\s*$/m,'\1'
152
+ end
153
+
154
+ end
103
155
 
104
- def triplrFeedRaw &f
105
- read.to_utf8.extend(FeedParse).parse &f
106
- rescue Exception => e
107
- puts [uri,e,e.backtrace[0]].join ' '
108
156
  end
109
157
 
110
- def triplrFeedNormalize *f
111
- send(*f){|s,p,o|
112
- yield s,
113
- { Purl+'dc/elements/1.1/creator' => Creator,
114
- Purl+'dc/elements/1.1/subject' => SIOC+'subject',
115
- Atom+'author' => Creator,
116
- RSS+'description' => Content,
117
- RSS+'encoded' => Content,
118
- RSSm+'content/encoded' => Content,
119
- Atom+'content' => Content,
120
- RSS+'title' => Title,
121
- Atom+'title' => Title,
122
- }[p]||p,
123
- o } end
158
+ FeedStop = /\b(at|blog|com(ments)?|html|info|org|photo|p|post|r|status|tag|twitter|wordpress|www|1999|2005)\b/
159
+ FeedArchiver = -> doc, graph, host {
160
+ doc.roonga host
161
+ graph.query(RDF::Query::Pattern.new(:s,R[R::Date],:o)).first_value.do{|t|
162
+ time = t.gsub(/[-T]/,'/').sub /(.00.00|Z)$/, '' # trim normalized timezones
163
+ base = (graph.name.to_s.sub(/http:\/\//,'.').gsub(/\W/,'..').gsub(FeedStop,'').sub(/\d{12,}/,'')+'.').gsub /\.+/,'.'
164
+ doc.ln R["http://#{host}/news/#{time}#{base}n3"]}}
165
+
166
+ GREP_DIRS.push /^\/news\/\d{4}/
124
167
 
125
168
  fn Render+'application/atom+xml',->d,e{
126
169
  id = 'http://' + e['SERVER_NAME'] + (CGI.escapeHTML e['REQUEST_URI'])
@@ -140,8 +183,6 @@ class R
140
183
  d[Creator].do{|c|{_: :author, c: c[0]}},
141
184
  {_: :content, type: :xhtml,
142
185
  c: {xmlns:"http://www.w3.org/1999/xhtml",
143
- c: d[Content]}}].cr
144
- }}.cr
145
- ]}])}
186
+ c: d[Content]}}].cr}}.cr]}])}
146
187
 
147
188
  end
@@ -1,5 +1,9 @@
1
1
  #watch __FILE__
2
2
  class R
3
+ =begin
4
+ an RDF::URI has a path defined in names.rb, as do other concepts such as a full "triple" - here we've built an RDF store using them
5
+ since this results in one path per-triple, it's mainly used for current resource-state and "backlink" (reverse order) indexing
6
+ =end
3
7
 
4
8
  def [] p; predicate p end
5
9
  def []= p,o
@@ -11,6 +15,14 @@ class R
11
15
  end
12
16
  end
13
17
 
18
+ def predicatePath p, s = true
19
+ container.as s ? p.R.shorten : p
20
+ end
21
+
22
+ def predicates
23
+ container.c.map{|c|c.base.expand.R}
24
+ end
25
+
14
26
  def predicate p, short = true
15
27
  p = predicatePath p, short
16
28
  p.node.take.map{|n|
@@ -52,7 +64,9 @@ class R
52
64
  end
53
65
  end
54
66
 
55
- def unsetFs p,o; setFs p,o,true end
67
+ def unsetFs p,o
68
+ setFs p,o,true
69
+ end
56
70
 
57
71
  def triplrInode
58
72
  if d?
@@ -71,17 +85,23 @@ class R
71
85
  def ln t, y=:link
72
86
  t = t.R
73
87
  t = t.uri[0..-2].R if t.uri[-1] == '/'
74
- if !t.e # destination exist?
88
+ if !t.e
75
89
  t.dirname.mk
76
90
  FileUtils.send y, node, t.node
77
91
  end
78
92
  end
79
93
 
94
+ def delete; node.deleteNode if e; self end
95
+ def exist?; node.exist? end
96
+ def file?; node.file? end
80
97
  def ln_s t; ln t, :symlink end
98
+ def mk; e || FileUtils.mkdir_p(d); self end
99
+ def mtime; node.stat.mtime if e end
100
+ def touch; FileUtils.touch node; self end
81
101
 
82
- def r p=false
102
+ def read p=false
83
103
  if f
84
- p ? (JSON.parse readFile) : readFile
104
+ p ? (JSON.parse File.open(d).read) : File.open(d).read
85
105
  else
86
106
  nil
87
107
  end
@@ -89,12 +109,19 @@ class R
89
109
  puts e
90
110
  end
91
111
 
92
- def w o,s=false
112
+ def write o,s=false
93
113
  dirname.mk
94
- writeFile (s ? o.to_json : o)
114
+ File.open(d,'w'){|f|
115
+ f << (s ? o.to_json : o)}
95
116
  self
96
117
  end
97
118
 
119
+ alias_method :e, :exist?
120
+ alias_method :f, :file?
121
+ alias_method :m, :mtime
122
+ alias_method :r, :read
123
+ alias_method :w, :write
124
+
98
125
  end
99
126
 
100
127
  class Pathname
@@ -1,15 +1,12 @@
1
1
  #watch __FILE__
2
2
  class R
3
3
  =begin
4
- graph construction is two-pass:
4
+ two-pass graph construction aka Protograph and Graph
5
5
 
6
- the first-pass will signify if the second-pass needs to be run. an ETag is be derived from the return-value, ideal fingerprint sources include filestats, mtime checks, extremely trivial SPARQL queries, SHA160 hashes of in-RAM entities.. <http://tools.ietf.org/html/draft-ietf-httpbis-p4-conditional-25#section-2.3>
6
+ Protograph = ETag. ideal fingerprint sources include filestats, mtime checks (#m), SHA160 hashes of in-RAM entities - <http://tools.ietf.org/html/draft-ietf-httpbis-p4-conditional-25#section-2.3>
7
7
 
8
- second-pass might fetch RDF from a SPARQL store. this lib was developed as an alternative to relying on (large, hard-to-implement, must be running, configured & connectable) SPARQL stores by using the filesystem as much as possible, to experiment with hybrids like SPARQLING up a set of files to be returned in standard Apache-as-static-fileserver fashion, and to webize non-RDF filesystem-content like email, directories, URLs in plain-text etc
9
-
10
- triple streams - a source function yields triples up to the caller as it finds them,
11
- a function providing a block (consumes yielded values) is a sink, both is a filter
12
- these can be stacked into pipelines. see the data-massaging stream-processing in feed.rb
8
+ a tripleStream function that yields triples up to its caller is a source, one that provides a block (consuming yielded values) is a sink, and one that does both is a filter
9
+ these can be stacked into pipelines, as in feed.rb
13
10
 
14
11
  =end
15
12
 
@@ -21,33 +18,7 @@ class R
21
18
  end; m
22
19
  end
23
20
 
24
- =begin
25
- * stream triples into graph (memory)
26
- * import missing resources to store (fs)
27
- * behave as normal triplr to caller, with
28
- side-effect of import/indexing to knowledgebase
29
- =end
30
- def addDocs triplr, host, p=nil, hook=nil, &b
31
- graph = fromStream({},triplr)
32
- docs = {}
33
- graph.map{|u,r|
34
- e = u.R # resource
35
- doc = e.ef # doc
36
- doc.e || # exists - we're nondestructive here
37
- (docs[doc.uri] ||= {} # init doc-graph
38
- docs[doc.uri][u] = r # add to graph
39
- p && p.map{|p| # index predicate
40
- r[p].do{|v|v.map{|o| # values exist?
41
- e.index p,o}}})} # index triple
42
- docs.map{|d,g| # resources in docs
43
- d = d.R; puts "<#{d.docBase}>"
44
- d.w g,true # write
45
- hook[d,g,host] if hook} # insert-hook
46
- graph.triples &b if b # emit triples
47
- self
48
- end
49
-
50
- # default protograph - identity < lazy-expandable resource-thunks
21
+ # default protograph - identity + lazy-expandable resource-thunks
51
22
  # Resource, Query, Graph -> graphID
52
23
  fn 'protograph/',->e,q,g{
53
24
  g['#'] = {'uri' => '#'}
@@ -76,14 +47,9 @@ class R
76
47
  s }
77
48
 
78
49
  # fs-derived ID for a resource-set
79
- fn 'docsID',->g,q{
80
- [q.has_key?('nocache').do{|_|rand},
81
- g.sort.map{|u,r|
82
- [u, r.respond_to?(:m) && r.m]}].h }
83
-
84
- # default graph (filesystem store)
85
- # to use a different default-graph function (w/o patching here, or querystring param), define a GET handler on / (or a subdir),
86
- # update configuration such as q['graph'] = 'hexastore' and return false or call #response..
50
+ fn 'docsID',->g,q{g.sort.map{|u,r|[u, r.respond_to?(:m) && r.m]}.h }
51
+
52
+ # default graph
87
53
  fn 'graph/',->e,q,m{
88
54
  # force thunks
89
55
  m.values.map{|r|(r.env e.env).graphFromFile m if r.class == R }
@@ -120,17 +86,56 @@ class R
120
86
  ].flatten.compact
121
87
  end
122
88
 
89
+
90
+ # GET Resource -> Graph
91
+ # missing resources -> local store
92
+
93
+ # JSON + Hash variant
94
+ def addDocs triplr, host, p=nil, hook=nil, &b
95
+ graph = fromStream({},triplr)
96
+ docs = {}
97
+ graph.map{|u,r|
98
+ e = u.R # resource
99
+ doc = e.ef # doc
100
+ doc.e || # exists - we're nondestructive here
101
+ (docs[doc.uri] ||= {} # init doc-graph
102
+ docs[doc.uri][u] = r # add to graph
103
+ p && p.map{|p| # index predicate
104
+ r[p].do{|v|v.map{|o| # values exist?
105
+ e.index p,o}}})} # index triple
106
+ docs.map{|d,g| # resources in docs
107
+ d = d.R; puts "<#{d.docBase}>"
108
+ d.w g,true # write
109
+ hook[d,g,host] if hook} # insert-hook
110
+ graph.triples &b if b # emit triples
111
+ self
112
+ end
113
+ # RDF::Graph variant
114
+ def addDocsRDF options = {}
115
+ g = RDF::Repository.load self, options
116
+ g.each_graph.map{|graph|
117
+ if graph.named?
118
+ doc = graph.name.n3
119
+ unless doc.e
120
+ doc.dirname.mk
121
+ RDF::Writer.open(doc.d){|f|f << graph} ; puts "<#{doc.docBase}> #{graph.count} triples"
122
+ options[:hook][doc,graph,options[:hostname]] if options[:hook]
123
+ end
124
+ end}
125
+ g
126
+ end
127
+
123
128
  def triplrDoc &f; docBase.glob('#*').map{|s| s.triplrResource &f} end
124
129
 
125
130
  def triplrResource; predicates.map{|p| self[p].map{|o| yield uri, p.uri, o}} end
126
131
 
127
132
  def triplrJSON
128
- yield uri, '/application/json', (JSON.parse read) if e
133
+ yield uri, '/application/json', r(true) if e
129
134
  rescue Exception => e
130
135
  end
131
136
 
132
137
  def to_json *a
133
- to_h.to_json *a
138
+ {'uri' => uri}.to_json *a
134
139
  end
135
140
 
136
141
  fn Render+'application/json',->d,_=nil{d.to_json}
@@ -160,8 +165,7 @@ class Hash
160
165
  def triples &f
161
166
  map{|s,r|
162
167
  r.map{|p,o|
163
- o.class == Array ? o.each{|o| yield s,p,o} : yield(s,p,o) unless p=='uri'} if r.class == Hash
164
- }
168
+ o.justArray.map{|o|yield s,p,o} unless p=='uri'} if r.class == Hash}
165
169
  end
166
170
 
167
171
  end