infod 0.0.3.3 → 0.0.3.4

@@ -1,126 +1,169 @@
  #watch __FILE__
-
- module FeedParse
-
- def html; CGI.unescapeHTML self end
- def cdata; sub /^\s*<\!\[CDATA\[(.*?)\]\]>\s*$/m,'\1'end
- def guess; send (case self
- when /^\s*<\!/m
- :cdata
- when /</m
- :id
- else
- :html
- end) end
-
- def parse
- x={} # populate XMLns prefix table
- match(/<(rdf|rss|feed)([^>]+)/i)[2].scan(/xmlns:?([a-z]+)?=["']?([^'">\s]+)/){|m|x[m[0]]=m[1]}
-
- # scan for resources
- scan(%r{<(?<ns>rss:|atom:)?(?<tag>item|entry)(?<attrs>[\s][^>]*)?>(?<inner>.*?)</\k<ns>?\k<tag>>}mi){|m|
- # identifier search
- attrs = m[2]
- inner = m[3]
- u = attrs.do{|a| # RDF-style identifier (RSS 1.0)
- a.match(/about=["']?([^'">\s]+)/).do{|s|
- s[1] }} ||
- (inner.match(/<link>([^<]+)/) || # <link> child-node or href attribute
- inner.match(/<link[^>]+rel=["']?alternate["']?[^>]+href=["']?([^'">\s]+)/) ||
- inner.match(/<(?:gu)?id[^>]*>([^<]+)/)).do{|s| s[1]} # <id> child
-
- if u
- if !u.match /^http/
- puts "no HTTP URIs found #{u}"
- u = '/junk/'+u.gsub('/','.')
- end
- yield u, R::Type, (R::SIOCt+'BlogPost').R
- yield u, R::Type, (R::SIOC+'Post').R
-
- #links
- inner.scan(%r{<(link|enclosure|media)([^>]+)>}mi){|e|
- e[1].match(/(href|url|src)=['"]?([^'">\s]+)/).do{|url|
- yield(u,R::Atom+'/link/'+((r=e[1].match(/rel=['"]?([^'">\s]+)/)) ? r[1] : e[0]), url[2].R)}}
-
- #elements
- inner.scan(%r{<([a-z]+:)?([a-z]+)([\s][^>]*)?>(.*?)</\1?\2>}mi){|e|
- yield u, # s
- (x[e[0]&&e[0].chop]||R::RSS)+e[1], # p
- e[3].extend(FeedParse).guess.do{|o|# o
- o.match(/\A(\/|http)[\S]+\Z/) ? o.R : R::F['cleanHTML'][o]
- }}
- else
- puts "no post-identifiers found #{u}"
- end
- }
-
- nil
- end
- end
-
  class R

+ def getFeed h='localhost'; addDocsRDF :format => :feed, :hook => FeedArchiver, :hostname => h end
+
  Atom = W3+'2005/Atom'
  RSS = Purl+'rss/1.0/'
  RSSm = RSS+'modules/'
- Feed = (R RSS+'channel')

  def listFeeds; (nokogiri.css 'link[rel=alternate]').map{|u|R (URI uri).merge(u.attr :href)} end
  alias_method :feeds, :listFeeds

- # add existing resources to index
- #
- # 'http:/'.R.take.select{|e|e.ext=='e'}.map{|r|R::FeedArchiver[r,r.graph,'localhost']}
+ module Feed
+
+ class Format < RDF::Format
+ content_type 'application/atom+xml', :extension => :atom
+ content_encoding 'utf-8'
+ reader { R::Feed::Reader }
+ end
+
+ class Reader < RDF::Reader
+
+ format Format
+
+ def initialize(input = $stdin, options = {}, &block)
+ @doc = (input.respond_to?(:read) ? input : StringIO.new(input.to_s)).read
+ if block_given?
+ case block.arity
+ when 0 then instance_eval(&block)
+ else block.call(self)
+ end
+ end
+ nil
+ end
+
+ def each_statement &fn
+ dateNormalize(:resolveURIs,:mapPredicates,:rawFeedTriples){|s,p,o|
+ fn.call RDF::Statement.new(s.R, p.R,
+ o.class == R ? o : (l = RDF::Literal o
+ l.datatype=RDF.XMLLiteral if p == Content
+ l),
+ :context => s.R.docBase)} end
+
+ def each_triple &block
+ each_statement{|s| block.call *s.to_triple}
+ end

- FeedArchiver = -> doc, graph, host {
- doc.roonga host
- graph.map{|u,r|
- r[Date].do{|t| # link doc to date-index
- t = t[0].gsub(/[-T]/,'/').sub /(.00.00|Z)$/, '' # trim normalized timezones and non-unique symbols
- stop = /\b(at|blog|com(ments)?|html|info|org|photo|p|post|r|status|tag|twitter|wordpress|www|1999|2005)\b/
- b = (u.sub(/http:\/\//,'.').gsub(/\W/,'..').gsub(stop,'').sub(/\d{12,}/,'')+'.').gsub /\.+/,'.'
- doc.ln R["http://#{host}/news/#{t}#{b}e"]}}
- doc}
+ def resolveURIs *f
+ send(*f){|s,p,o|
+ yield s, p, p == Content ?
+ (Nokogiri::HTML.fragment o).do{|o|
+ o.css('a').map{|a|
+ if a.has_attribute? 'href'
+ (a.set_attribute 'href', (URI.join s, (a.attr 'href'))) rescue nil
+ end}
+ o.to_s} : o}
+ end

- GREP_DIRS.push /^\/news\/\d{4}/
+ def mapPredicates *f
+ send(*f){|s,p,o|
+ yield s,
+ { Purl+'dc/elements/1.1/creator' => Creator,
+ Purl+'dc/elements/1.1/subject' => SIOC+'subject',
+ Atom+'author' => Creator,
+ RSS+'description' => Content,
+ RSS+'encoded' => Content,
+ RSSm+'content/encoded' => Content,
+ Atom+'content' => Content,
+ RSS+'title' => Title,
+ Atom+'title' => Title,
+ }[p]||p,
+ o }
+ end

- def getFeed g; addDocs :triplrFeed, g, nil, FeedArchiver end
- def getFeedReddit g; addDocs :triplrFeedReddit, g, nil, FeedArchiver end
+ def rawFeedTriples
+ x={} #XMLns prefix table
+ @doc.match(/<(rdf|rss|feed)([^>]+)/i)[2].scan(/xmlns:?([a-z]+)?=["']?([^'">\s]+)/){|m|x[m[0]]=m[1]}
+
+ # resources
+ @doc.scan(%r{<(?<ns>rss:|atom:)?(?<tag>item|entry)(?<attrs>[\s][^>]*)?>(?<inner>.*?)</\k<ns>?\k<tag>>}mi){|m|
+ # identifier search
+ attrs = m[2]
+ inner = m[3]
+ u = attrs.do{|a| # RDF-style identifier (RSS 1.0)
+ a.match(/about=["']?([^'">\s]+)/).do{|s|
+ s[1] }} ||
+ (inner.match(/<link>([^<]+)/) || # <link> child-node or href attribute
+ inner.match(/<link[^>]+rel=["']?alternate["']?[^>]+href=["']?([^'">\s]+)/) ||
+ inner.match(/<(?:gu)?id[^>]*>([^<]+)/)).do{|s| s[1]} # <id> child
+
+ if u
+ if !u.match /^http/
+ puts "no HTTP URIs found #{u}"
+ u = '/junk/'+u.gsub('/','.')
+ end
+ yield u, R::Type, (R::SIOCt+'BlogPost').R
+ yield u, R::Type, (R::SIOC+'Post').R
+
+ #links
+ inner.scan(%r{<(link|enclosure|media)([^>]+)>}mi){|e|
+ e[1].match(/(href|url|src)=['"]?([^'">\s]+)/).do{|url|
+ yield(u,R::Atom+'/link/'+((r=e[1].match(/rel=['"]?([^'">\s]+)/)) ? r[1] : e[0]), url[2].R)}}
+
+ #elements
+ inner.scan(%r{<([a-z]+:)?([a-z]+)([\s][^>]*)?>(.*?)</\1?\2>}mi){|e|
+ yield u, # s
+ (x[e[0]&&e[0].chop]||R::RSS)+e[1], # p
+ e[3].extend(SniffContent).sniff.do{|o|# o
+ o.match(/\A(\/|http)[\S]+\Z/) ? o.R : R::F['cleanHTML'][o]
+ }}
+ else
+ puts "no post-identifiers found #{u}"
+ end
+ }

- def triplrFeed &f
- dateNorm :contentURIresolve,:triplrFeedNormalize,:triplrFeedRaw,&f
- end
+ end
+
+ def dateNormalize *f
+ send(*f){|s,p,o|
+ yield *({'CreationDate' => true,
+ 'Date' => true,
+ RSS+'pubDate' => true,
+ Date => true,
+ Purl+'dc/elements/1.1/date' => true,
+ Atom+'published' => true,
+ Atom+'updated' => true
+ }[p] ?
+ [s,Date,Time.parse(o).utc.iso8601] : [s,p,o])}
+ end

- def triplrFeedReddit &f
- triplrFeed {|s,p,o|
- p == Content ?
- Nokogiri::HTML.parse(o).do{|o|
- o.css('.md').do{|o|yield s,p,o}
- yield s,Creator,o.css('a')[-4].child.to_s.strip
- yield s,Type,(SIOCt+'BoardPost').R
- } : (yield s,p,o)}
- end
+ end
+
+ module SniffContent
+
+ def sniff
+ send (case self
+ when /^\s*<\!/m
+ :cdata
+ when /</m
+ :id
+ else
+ :html
+ end)
+ end
+
+ def html
+ CGI.unescapeHTML self
+ end
+
+ def cdata
+ sub /^\s*<\!\[CDATA\[(.*?)\]\]>\s*$/m,'\1'
+ end
+
+ end

- def triplrFeedRaw &f
- read.to_utf8.extend(FeedParse).parse &f
- rescue Exception => e
- puts [uri,e,e.backtrace[0]].join ' '
  end

- def triplrFeedNormalize *f
- send(*f){|s,p,o|
- yield s,
- { Purl+'dc/elements/1.1/creator' => Creator,
- Purl+'dc/elements/1.1/subject' => SIOC+'subject',
- Atom+'author' => Creator,
- RSS+'description' => Content,
- RSS+'encoded' => Content,
- RSSm+'content/encoded' => Content,
- Atom+'content' => Content,
- RSS+'title' => Title,
- Atom+'title' => Title,
- }[p]||p,
- o } end
+ FeedStop = /\b(at|blog|com(ments)?|html|info|org|photo|p|post|r|status|tag|twitter|wordpress|www|1999|2005)\b/
+ FeedArchiver = -> doc, graph, host {
+ doc.roonga host
+ graph.query(RDF::Query::Pattern.new(:s,R[R::Date],:o)).first_value.do{|t|
+ time = t.gsub(/[-T]/,'/').sub /(.00.00|Z)$/, '' # trim normalized timezones
+ base = (graph.name.to_s.sub(/http:\/\//,'.').gsub(/\W/,'..').gsub(FeedStop,'').sub(/\d{12,}/,'')+'.').gsub /\.+/,'.'
+ doc.ln R["http://#{host}/news/#{time}#{base}n3"]}}
+
+ GREP_DIRS.push /^\/news\/\d{4}/

  fn Render+'application/atom+xml',->d,e{
  id = 'http://' + e['SERVER_NAME'] + (CGI.escapeHTML e['REQUEST_URI'])
@@ -140,8 +183,6 @@ class R
  d[Creator].do{|c|{_: :author, c: c[0]}},
  {_: :content, type: :xhtml,
  c: {xmlns:"http://www.w3.org/1999/xhtml",
- c: d[Content]}}].cr
- }}.cr
- ]}])}
+ c: d[Content]}}].cr}}.cr]}])}

  end
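
Annotation: the new Feed::Reader above follows RDF.rb's reader-plugin convention, registering a Format by content-type and exposing #each_statement/#each_triple. A minimal usage sketch, assuming infod and the rdf gem are loaded and 'feed.atom' is a hypothetical input file (neither appears in this diff):

    require 'rdf'

    # the block receives the reader (arity 1), per the dispatch in #initialize
    R::Feed::Reader.new(File.read 'feed.atom') do |reader|
      reader.each_statement do |statement|
        puts statement.inspect  # RDF::Statement, :context set to the doc base
      end
    end
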
@@ -1,5 +1,9 @@
  #watch __FILE__
  class R
+ =begin
+ a RDF::URI has a path defined in names.rb, so do other concepts like a full "triple" - here we've built a RDF store using them
+ since this results in one path per-triple, it's mainly used for current resource-state and "backlink" (reverse order) indexing
+ =end

  def [] p; predicate p end
  def []= p,o
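
Annotation: the =begin note above describes the per-triple filesystem layout these accessors sit on; R#[] reads a predicate's current values from the store and R#[]= writes one. A hedged sketch of the access pattern (the URI is illustrative; Title is a predicate constant defined elsewhere in the gem):

    post = 'http://example.org/post#1'.R
    post[R::Title] = 'hello'  # write under the predicate's path
    post[R::Title]            # read the current values back
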
@@ -11,6 +15,14 @@ class R
  end
  end

+ def predicatePath p, s = true
+ container.as s ? p.R.shorten : p
+ end
+
+ def predicates
+ container.c.map{|c|c.base.expand.R}
+ end
+
  def predicate p, short = true
  p = predicatePath p, short
  p.node.take.map{|n|
@@ -52,7 +64,9 @@ class R
  end
  end

- def unsetFs p,o; setFs p,o,true end
+ def unsetFs p,o
+ setFs p,o,true
+ end

  def triplrInode
  if d?
@@ -71,17 +85,23 @@ class R
  def ln t, y=:link
  t = t.R
  t = t.uri[0..-2].R if t.uri[-1] == '/'
- if !t.e # destination exist?
+ if !t.e
  t.dirname.mk
  FileUtils.send y, node, t.node
  end
  end

+ def delete; node.deleteNode if e; self end
+ def exist?; node.exist? end
+ def file?; node.file? end
  def ln_s t; ln t, :symlink end
+ def mk; e || FileUtils.mkdir_p(d); self end
+ def mtime; node.stat.mtime if e end
+ def touch; FileUtils.touch node; self end

- def r p=false
+ def read p=false
  if f
- p ? (JSON.parse readFile) : readFile
+ p ? (JSON.parse File.open(d).read) : File.open(d).read
  else
  nil
  end
@@ -89,12 +109,19 @@ class R
  puts e
  end

- def w o,s=false
+ def write o,s=false
  dirname.mk
- writeFile (s ? o.to_json : o)
+ File.open(d,'w'){|f|
+ f << (s ? o.to_json : o)}
  self
  end

+ alias_method :e, :exist?
+ alias_method :f, :file?
+ alias_method :m, :mtime
+ alias_method :r, :read
+ alias_method :w, :write
+
  end

  class Pathname
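
Annotation: the spelled-out filesystem helpers keep their old one-letter call sites working via the alias_method block above. A usage sketch ('/tmp/x' is an illustrative path):

    r = '/tmp/x'.R
    r.write({'a' => 1}, true)  # serialize to JSON and write; alias #w
    r.read true                # read and JSON-parse; alias #r
    r.exist?                   # alias #e
    r.mtime                    # alias #m
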
@@ -1,15 +1,12 @@
  #watch __FILE__
  class R
  =begin
- graph construction is two-pass:
+ two-pass graph construction aka Protograph and Graph

- the first-pass will signify if the second-pass needs to be run. an ETag is be derived from the return-value, ideal fingerprint sources include filestats, mtime checks, extremely trivial SPARQL queries, SHA160 hashes of in-RAM entities.. <http://tools.ietf.org/html/draft-ietf-httpbis-p4-conditional-25#section-2.3>
+ Protograph = ETag. ideal fingerprint sources include filestats, mtime checks (#m), SHA160 hashes of in-RAM entities - <http://tools.ietf.org/html/draft-ietf-httpbis-p4-conditional-25#section-2.3>

- second-pass might fetch RDF from a SPARQL store. this lib was developed as an alternative to relying on (large, hard-to-implement, must be running, configured & connectable) SPARQL stores by using the filesystem as much as possible, to experiment with hybrids like SPARQLING up a set of files to be returned in standard Apache-as-static-fileserver fashion, and to webize non-RDF filesystem-content like email, directories, URLs in plain-text etc
-
- triple streams - a source function yields triples up to the caller as it finds them,
- a function providing a block (consumes yielded values) is a sink, both is a filter
- these can be stacked into pipelines. see the data-massaging stream-processing in feed.rb
+ a tripleStream function constructing a block (consumes yielded values) is a sink, inverse is a source, both a filter
+ these can be stacked into pipelines, as in feed.rb

  =end
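
Annotation: a minimal sketch of the source/filter/sink convention described in the note above (method names here are illustrative, not from the gem): each filter receives the remaining stage names, forwards them with send, and re-yields transformed triples, which is how dateNormalize(:resolveURIs,:mapPredicates,:rawFeedTriples) stacks in feed.rb:

    def rawTriples                 # source: yields triples as it finds them
      yield 'http://example.org/', 'title', 'hi'
    end

    def upcaseObjects *f           # filter: consume, transform, re-yield
      send(*f){|s,p,o| yield s, p, o.upcase}
    end

    upcaseObjects(:rawTriples){|s,p,o| puts [s,p,o].join ' '}  # sink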
 
@@ -21,33 +18,7 @@ class R
  end; m
  end

- =begin
- * stream triples into graph (memory)
- * import missing resources to store (fs)
- * behave as normal triplr to caller, with
- side-effect of import/indexing to knowledgebase
- =end
- def addDocs triplr, host, p=nil, hook=nil, &b
- graph = fromStream({},triplr)
- docs = {}
- graph.map{|u,r|
- e = u.R # resource
- doc = e.ef # doc
- doc.e || # exists - we're nondestructive here
- (docs[doc.uri] ||= {} # init doc-graph
- docs[doc.uri][u] = r # add to graph
- p && p.map{|p| # index predicate
- r[p].do{|v|v.map{|o| # values exist?
- e.index p,o}}})} # index triple
- docs.map{|d,g| # resources in docs
- d = d.R; puts "<#{d.docBase}>"
- d.w g,true # write
- hook[d,g,host] if hook} # insert-hook
- graph.triples &b if b # emit triples
- self
- end
-
- # default protograph - identity < lazy-expandable resource-thunks
+ # default protograph - identity + lazy-expandable resource-thunks
  # Resource, Query, Graph -> graphID
  fn 'protograph/',->e,q,g{
  g['#'] = {'uri' => '#'}
@@ -76,14 +47,9 @@ class R
  s }

  # fs-derived ID for a resource-set
- fn 'docsID',->g,q{
- [q.has_key?('nocache').do{|_|rand},
- g.sort.map{|u,r|
- [u, r.respond_to?(:m) && r.m]}].h }
-
- # default graph (filesystem store)
- # to use a different default-graph function (w/o patching here, or querystring param), define a GET handler on / (or a subdir),
- # update configuration such as q['graph'] = 'hexastore' and return false or call #response..
+ fn 'docsID',->g,q{g.sort.map{|u,r|[u, r.respond_to?(:m) && r.m]}.h }
+
+ # default graph
  fn 'graph/',->e,q,m{
  # force thunks
  m.values.map{|r|(r.env e.env).graphFromFile m if r.class == R }
@@ -120,17 +86,56 @@ class R
  ].flatten.compact
  end

+
+ # GET Resource -> Graph
+ # missing resources -> local store
+
+ # JSON + Hash variant
+ def addDocs triplr, host, p=nil, hook=nil, &b
+ graph = fromStream({},triplr)
+ docs = {}
+ graph.map{|u,r|
+ e = u.R # resource
+ doc = e.ef # doc
+ doc.e || # exists - we're nondestructive here
+ (docs[doc.uri] ||= {} # init doc-graph
+ docs[doc.uri][u] = r # add to graph
+ p && p.map{|p| # index predicate
+ r[p].do{|v|v.map{|o| # values exist?
+ e.index p,o}}})} # index triple
+ docs.map{|d,g| # resources in docs
+ d = d.R; puts "<#{d.docBase}>"
+ d.w g,true # write
+ hook[d,g,host] if hook} # insert-hook
+ graph.triples &b if b # emit triples
+ self
+ end
+ # RDF::Graph variant
+ def addDocsRDF options = {}
+ g = RDF::Repository.load self, options
+ g.each_graph.map{|graph|
+ if graph.named?
+ doc = graph.name.n3
+ unless doc.e
+ doc.dirname.mk
+ RDF::Writer.open(doc.d){|f|f << graph} ; puts "<#{doc.docBase}> #{graph.count} triples"
+ options[:hook][doc,graph,options[:hostname]] if options[:hook]
+ end
+ end}
+ g
+ end
+
  def triplrDoc &f; docBase.glob('#*').map{|s| s.triplrResource &f} end

  def triplrResource; predicates.map{|p| self[p].map{|o| yield uri, p.uri, o}} end

  def triplrJSON
- yield uri, '/application/json', (JSON.parse read) if e
+ yield uri, '/application/json', r(true) if e
  rescue Exception => e
  end

  def to_json *a
- to_h.to_json *a
+ {'uri' => uri}.to_json *a


  fn Render+'application/json',->d,_=nil{d.to_json}
@@ -160,8 +165,7 @@ class Hash
  def triples &f
  map{|s,r|
  r.map{|p,o|
- o.class == Array ? o.each{|o| yield s,p,o} : yield(s,p,o) unless p=='uri'} if r.class == Hash
- }
+ o.justArray.map{|o|yield s,p,o} unless p=='uri'} if r.class == Hash}
  end

  end
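
Annotation: the rewritten Hash#triples leans on #justArray to flatten the Array-vs-scalar branch it replaces; judging from that old branch, a minimal definition would be:

    class Object; def justArray; [self] end end
    class Array; def justArray; self end end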