pho 0.7.8 → 0.7.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60) hide show
  1. data/CHANGES +8 -0
  2. data/Rakefile +4 -2
  3. data/doc/rdoc/classes/Pho.html +4 -0
  4. data/doc/rdoc/classes/Pho/CommandLine.html +114 -114
  5. data/doc/rdoc/classes/Pho/DatatypeProperty.html +12 -12
  6. data/doc/rdoc/classes/Pho/Enrichment/StoreEnricher.html +29 -29
  7. data/doc/rdoc/classes/Pho/Etags.html +36 -36
  8. data/doc/rdoc/classes/Pho/Facet/Results.html +19 -19
  9. data/doc/rdoc/classes/Pho/Facet/Term.html +6 -6
  10. data/doc/rdoc/classes/Pho/FieldPredicateMap.html +102 -102
  11. data/doc/rdoc/classes/Pho/FieldWeighting.html +12 -12
  12. data/doc/rdoc/classes/Pho/FileManagement.html +16 -0
  13. data/doc/rdoc/classes/Pho/FileManagement/AbstractFileManager.html +168 -108
  14. data/doc/rdoc/classes/Pho/FileManagement/BNodeRewritingHandler.html +226 -0
  15. data/doc/rdoc/classes/Pho/FileManagement/FileManager.html +48 -35
  16. data/doc/rdoc/classes/Pho/FileManagement/FileSplitter.html +291 -0
  17. data/doc/rdoc/classes/Pho/FileManagement/RDFManager.html +22 -22
  18. data/doc/rdoc/classes/Pho/FileManagement/StatementHandler.html +152 -0
  19. data/doc/rdoc/classes/Pho/FileManagement/Util.html +212 -0
  20. data/doc/rdoc/classes/Pho/Job.html +64 -64
  21. data/doc/rdoc/classes/Pho/Jobs.html +60 -60
  22. data/doc/rdoc/classes/Pho/OAI/Record.html +6 -6
  23. data/doc/rdoc/classes/Pho/OAI/Records.html +24 -24
  24. data/doc/rdoc/classes/Pho/OAI/Statistics.html +12 -12
  25. data/doc/rdoc/classes/Pho/QueryProfile.html +66 -66
  26. data/doc/rdoc/classes/Pho/ResourceHash/Converter.html +36 -36
  27. data/doc/rdoc/classes/Pho/ResourceHash/SetAlgebra.html +12 -12
  28. data/doc/rdoc/classes/Pho/Snapshot.html +35 -35
  29. data/doc/rdoc/classes/Pho/Sparql/SparqlClient.html +74 -74
  30. data/doc/rdoc/classes/Pho/Sparql/SparqlHelper.html +78 -78
  31. data/doc/rdoc/classes/Pho/Status.html +26 -26
  32. data/doc/rdoc/classes/Pho/Store.html +215 -215
  33. data/doc/rdoc/classes/Pho/StoreSparqlClient.html +12 -12
  34. data/doc/rdoc/classes/Pho/Update/Changeset.html +63 -63
  35. data/doc/rdoc/classes/Pho/Update/ChangesetBuilder.html +24 -24
  36. data/doc/rdoc/classes/Pho/Update/ChangesetHelper.html +6 -6
  37. data/doc/rdoc/classes/Pho/Update/Changesets.html +12 -12
  38. data/doc/rdoc/classes/Pho/Update/LiteralStatement.html +18 -18
  39. data/doc/rdoc/classes/Pho/Update/ResourceStatement.html +24 -24
  40. data/doc/rdoc/classes/Pho/Update/Statement.html +24 -24
  41. data/doc/rdoc/classes/String.html +1 -1
  42. data/doc/rdoc/created.rid +1 -1
  43. data/doc/rdoc/files/CHANGES.html +13 -1
  44. data/doc/rdoc/files/lib/pho/file_management_rb.html +1 -1
  45. data/doc/rdoc/files/lib/pho/file_manager_rb.html +1 -1
  46. data/doc/rdoc/files/lib/pho/rdf_collection_rb.html +1 -1
  47. data/doc/rdoc/files/lib/pho/upload_rb.html +101 -0
  48. data/doc/rdoc/files/lib/pho_rb.html +4 -2
  49. data/doc/rdoc/fr_class_index.html +4 -0
  50. data/doc/rdoc/fr_file_index.html +1 -0
  51. data/doc/rdoc/fr_method_index.html +228 -218
  52. data/lib/pho.rb +4 -1
  53. data/lib/pho/file_management.rb +30 -1
  54. data/lib/pho/file_manager.rb +7 -0
  55. data/lib/pho/rdf_collection.rb +4 -4
  56. data/lib/pho/upload.rb +156 -0
  57. data/tests/tc_bnodehandler.rb +42 -0
  58. data/tests/tc_filesplitter.rb +51 -0
  59. data/tests/ts_pho.rb +3 -1
  60. metadata +56 -8
data/lib/pho.rb CHANGED
@@ -4,7 +4,9 @@ require 'json'
4
4
  require 'yaml'
5
5
  require 'date'
6
6
  require 'rexml/document'
7
- require 'md5'
7
+ #require 'md5'
8
+ require "digest/md5"
9
+ require "ftools"
8
10
 
9
11
  #RDF.rb
10
12
  require 'rdf'
@@ -31,6 +33,7 @@ require 'pho/enrichment'
31
33
  require 'pho/command_line'
32
34
  require 'pho/oai'
33
35
  require 'pho/converter'
36
+ require 'pho/upload.rb'
34
37
 
35
38
  if RUBY_VERSION < "1.8.7"
36
39
  class String
@@ -1,7 +1,31 @@
1
1
  module Pho
2
2
 
3
+ #This module provides support for managing local directories of files and mirroring them
4
+ #into either the metabox or contentbox of a Talis Platform store
3
5
  module FileManagement
4
6
 
7
+ #Base class for the management of collections of files
8
+ #
9
+ #A collection is considered to be a directory (and sub-directory) of files. Depending on
10
+ #the type of collection manager used (e.g. +Pho::FileManagement::FileManager+ or
11
+ #+Pho::FileManagement::RDFManager+) these may be collections of any kind of file, or just
12
+ #RDF serialisations.
13
+ #
14
+ #The basic mechanism has support for:
15
+ # * tracking whether a file has been successfully loaded into a store (or not)
16
+ # * identifying all new files, just those that have been successfully stored, or failed
17
+ # * identifying files that have been changed locally since last stored
18
+ # * storage of any of these classes of files either by uploading into the contentbox or
19
+ # storing in the metabox
20
+ # * resumable uploads
21
+ # * retrying uploads
22
+ # * traversal across a directory structure to manage a complete set of files
23
+ #
24
+ #Files are tracked locally by keeping some hidden files in the same directory system as the
25
+ #files being tracked. These are currently all managed in a hidden directory (".pho") that is
26
+ #automatically created. To access error messages from uploads look for the corresponding "fail file"
27
+ #for the relevant file. E.g. /foo/file.txt will be tracked in /foo/.pho, and will have either an
28
+ #OK file (/foo/.pho/file.txt.ok) or a fail file (/foo/.pho/file.txt.fail).
5
29
  class AbstractFileManager
6
30
 
7
31
  attr_reader :dir
@@ -162,7 +186,12 @@ module Pho
162
186
  relative_path = relative_path.gsub(base, "#{TRACKING_DIR}/#{base}")
163
187
  return "#{@dir}#{relative_path}.#{@ok_suffix}"
164
188
  end
165
-
189
+
190
+ def create_tracking_dir(filename)
191
+ path = filename.split("/")[0..-2].join("/")
192
+ Dir.mkdir("#{path}/#{TRACKING_DIR}") unless File.exists?("#{path}/#{TRACKING_DIR}")
193
+ end
194
+
166
195
  end
167
196
 
168
197
  end
@@ -4,6 +4,13 @@ module Pho
4
4
 
5
5
  module FileManagement
6
6
 
7
+ #Manages a collection of files in a directory structure, uploading them to
8
+ #the contentbox of a platform store
9
+ #
10
+ #Allows the base directory of the upload to be specified, giving some flexibility
11
+ #on how the files are published via the contentbox. By default the files will
12
+ #be rooted in /items/ but by specifying the base parameter in the constructor, this
13
+ #can be altered to, e.g. /items/foo
7
14
  class FileManager < AbstractFileManager
8
15
 
9
16
  attr_reader :base
@@ -2,10 +2,10 @@ module Pho
2
2
 
3
3
  module FileManagement
4
4
 
5
- # Provides a simple mechanism for managing a directory of RDF/XML documents
6
- # and uploading them to platform store.
5
+ #Provides a simple mechanism for managing a directory of RDF documents
6
+ #and uploading them to a platform store.
7
7
  #
8
- # Allows a collection to be mirrored into the platform
8
+ #Currently supports RDF/XML, Turtle and N-Triples
9
9
  class RDFManager < AbstractFileManager
10
10
 
11
11
  RDF = "rdf".freeze
@@ -40,7 +40,7 @@ module Pho
40
40
  else
41
41
  response = @store.store_file(file)
42
42
  end
43
-
43
+ create_tracking_dir(filename)
44
44
  if (response.status < 300 )
45
45
  File.open(get_ok_file_for(filename), "w") do |file|
46
46
  file.print( "OK" )
@@ -0,0 +1,156 @@
1
+ module Pho
2
+
3
+ module FileManagement
4
+
5
+ #Default statement handler, does nothing
6
+ class StatementHandler
7
+ def handle(statement)
8
+ return statement
9
+ end
10
+ end
11
+
12
+ #Remove bnodes from the input data by assigning URIs to them
13
+ #
14
+ #This implementation generates a simple hexdigest based on the node id
15
+ #and uses that to construct a uri based on a base uri provided in the
16
+ #constructor
17
+ class BNodeRewritingHandler
18
+ # base:: base uri for URIs generated for blank nodes
19
+ def initialize(base)
20
+ @base = base
21
+ end
22
+
23
+ def handle(statement)
24
+ if !statement.has_blank_nodes?
25
+ return statement
26
+ end
27
+ subject = statement.subject
28
+ if subject.anonymous?
29
+ subject = RDF::URI.new( assign_uri(subject) )
30
+ end
31
+ object = statement.object
32
+ if object.anonymous?
33
+ object = RDF::URI.new( assign_uri(object) )
34
+ end
35
+ return RDF::Statement.new(subject, statement.predicate, object)
36
+ end
37
+
38
+ #FIXME the semantics of this are wrong if nodeIds are reused across
39
+ #datasets
40
+ def assign_uri(node)
41
+ return "#{@base}/#{Digest::MD5.hexdigest( node.id )}#self"
42
+ end
43
+
44
+ end
45
+
46
+ #Supports splitting RDF data files into smaller chunks of ntriples
47
+ class FileSplitter
48
+
49
+ attr_reader :dir, :triples, :handler
50
+
51
+ DEFAULT_CHUNK_SIZE = 10000
52
+
53
+ #Create a file splitter instance
54
+ #
55
+ # dir:: temporary directory into which split files should be written
56
+ # triples:: number of triples per split file
57
+ # handler:: statement handler to allow pre-processing of statements
58
+ def initialize(dir="/tmp", triples=DEFAULT_CHUNK_SIZE,
59
+ handler=Pho::FileManagement::StatementHandler.new)
60
+ @dir = dir
61
+ @triples = triples
62
+ @handler = handler
63
+ end
64
+
65
+ #Split a single file, in any parseable RDF format into smaller
66
+ #chunks of ntriples. Chunked files are stored in the default temporary
67
+ #directory for this instance
68
+ #
69
+ # filename:: name of the file to split
70
+ # format:: input format, default is :ntriples
71
+ def split_file(filename, format=:ntriples)
72
+
73
+ basename = File.basename(filename, ".#{filename.split(".").last}")
74
+ count = 0
75
+ stmts = []
76
+ RDF::Reader.for(format).new(File.new(filename)) do |reader|
77
+ reader.each_statement do |statement|
78
+ count += 1
79
+ stmts << @handler.handle( statement )
80
+ if count % @triples == 0
81
+ RDF::Writer.open( File.join(@dir, "#{basename}_#{count}.nt") ) do |writer|
82
+ stmts.each do |s|
83
+ writer << s
84
+ end
85
+ end
86
+ stmts = []
87
+ end
88
+ end
89
+ end
90
+ if !stmts.empty?
91
+ RDF::Writer.open( File.join(@dir, "#{basename}_#{count}.nt") ) do |writer|
92
+ stmts.each do |s|
93
+ writer << s
94
+ end
95
+ end
96
+ end
97
+ end
98
+
99
+ #Split a list of files into smaller chunks
100
+ #
101
+ # list_of_filenames:: array of filenames
102
+ # format:: format of the files, default is :ntriples
103
+ def split_files(list_of_filenames, format=:ntriples)
104
+ list_of_filenames.each do |name|
105
+ split_file(name, format)
106
+ end
107
+ end
108
+
109
+ end
110
+
111
+ class Util
112
+
113
+ #Take a directory of files, copy them to temporary directory, splitting
114
+ #where necessary, in preparation for uploading to a platform store.
115
+ #
116
+ #Source directory is scanned for ntriple, turtle and RDF/XML files. All of
117
+ #these are automatically chunked (10,000 triples per chunk by default) and re-serialized
118
+ #as ntriples
119
+ #
120
+ #BNodes are automatically re-written to full uris.
121
+ #
122
+ # store:: Pho::Store into which data will be posted. Used when normalizing bnodes
123
+ # src_dir:: directory containing source data.
124
+ def Util.prepare_platform_upload(store, src_dir, collection_dir,
125
+ triples=FileSplitter::DEFAULT_CHUNK_SIZE)
126
+
127
+ handler = BNodeRewritingHandler.new( store.build_uri("/items") )
128
+ splitter = FileSplitter.new(collection_dir, triples, handler )
129
+
130
+ formats = [ ["*.rdf", :rdfxml], ["*.nt", :ntriples], ["*.ttl", :turtle] ]
131
+ formats.each do |format|
132
+
133
+ files = Dir.glob( File.join(src_dir, format[0] ) )
134
+ splitter.split_files(files, format[1] )
135
+
136
+ end
137
+ return true
138
+
139
+ end
140
+
141
+ #Prepares a batch of files for uploading into the platform, then posts
142
+ #that collection to the designated store
143
+ #
144
+ #Returns an RDFManager instance that can be inspected to check for successes
145
+ def Util.prepare_and_store_upload(store, src_dir, collection_dir,
146
+ triples=FileSplitter::DEFAULT_CHUNK_SIZE)
147
+
148
+ prepare_platform_upload(store, src_dir, collection_dir)
149
+ collection = Pho::FileManagement::RDFManager.new(store, collection_dir)
150
+ collection.store()
151
+ return collection
152
+ end
153
+
154
+ end
155
+ end
156
+ end
@@ -0,0 +1,42 @@
1
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
2
+ require 'pho'
3
+ require 'test/unit'
4
+ require 'mocha'
5
+
6
+ class BNodeRewritingHandlerTest < Test::Unit::TestCase
7
+
8
+ def test_uris_are_not_changed()
9
+ handler = Pho::FileManagement::BNodeRewritingHandler.new("http://example.org")
10
+
11
+ subject = RDF::URI.new("http://example.org/subject")
12
+ object = RDF::URI.new("http://example.org/object")
13
+
14
+ statement = RDF::Statement.new(subject, RDF::RDFS.label, object)
15
+
16
+ handled = handler.handle(statement)
17
+
18
+ assert_equal( statement.subject, handled.subject)
19
+ assert_equal( statement.predicate, handled.predicate)
20
+ assert_equal( statement.object, handled.object)
21
+
22
+ end
23
+
24
+ def test_uri_assignment()
25
+ handler = Pho::FileManagement::BNodeRewritingHandler.new("http://example.org")
26
+
27
+ subject = RDF::Node.new
28
+ object = RDF::Node.new
29
+
30
+ statement = RDF::Statement.new(subject, RDF::RDFS.label, object)
31
+ handled = handler.handle(statement)
32
+
33
+ assert_equal(statement.predicate, handled.predicate)
34
+ assert_equal(false, handled.subject.anonymous?)
35
+ assert_equal(false, handled.object.anonymous?)
36
+ assert_equal("http://example.org/#{Digest::MD5.hexdigest( subject.id )}#self",
37
+ handled.subject.to_s)
38
+ assert_equal("http://example.org/#{Digest::MD5.hexdigest( object.id )}#self",
39
+ handled.object.to_s)
40
+
41
+ end
42
+ end
@@ -0,0 +1,51 @@
1
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
2
+ require 'pho'
3
+ require 'test/unit'
4
+ require 'mocha'
5
+
6
+ class FileSplitterTest < Test::Unit::TestCase
7
+
8
+ def setup()
9
+ Dir.mkdir("/tmp/pho") unless File.exists?("/tmp/pho")
10
+ Dir.mkdir("/tmp/pho/split") unless File.exists?("/tmp/pho/split")
11
+ Dir.mkdir("/tmp/pho/split/tmp") unless File.exists?("/tmp/pho/split/tmp")
12
+
13
+ RDF::Writer.open("/tmp/pho/split/large.nt") do |writer|
14
+ 31.times do
15
+ writer << RDF::Statement.new(
16
+ RDF::Resource.new("http://www.example.org"),
17
+ RDF::RDFS.label,
18
+ RDF::Literal.new("This is a test")
19
+ )
20
+ end
21
+ end
22
+
23
+ end
24
+
25
+ def teardown()
26
+ Dir.glob("/tmp/pho/split/tmp/*.*") do |file|
27
+ File.delete(file)
28
+ end
29
+ Dir.glob("/tmp/pho/split/*.*") do |file|
30
+ File.delete(file)
31
+ end
32
+ delete("/tmp/pho/split/tmp")
33
+ delete("/tmp/pho/split")
34
+ end
35
+
36
+ def delete(dir)
37
+ Dir.delete(dir) if File.exists?(dir)
38
+ end
39
+
40
+ def test_split_file()
41
+ splitter = Pho::FileManagement::FileSplitter.new("/tmp/pho/split/tmp", 10)
42
+ splitter.split_file("/tmp/pho/split/large.nt")
43
+ assert_equal(4, Dir.glob("/tmp/pho/split/tmp/large*.nt").size )
44
+ assert_equal(true, File.exists?("/tmp/pho/split/tmp/large_10.nt"))
45
+ assert_equal(true, File.exists?("/tmp/pho/split/tmp/large_20.nt"))
46
+ assert_equal(true, File.exists?("/tmp/pho/split/tmp/large_30.nt"))
47
+ assert_equal(true, File.exists?("/tmp/pho/split/tmp/large_31.nt"))
48
+
49
+ end
50
+
51
+ end
@@ -23,4 +23,6 @@ require 'tc_changesets.rb'
23
23
  require 'tc_converter.rb'
24
24
  require 'tc_enrichment.rb'
25
25
  require 'tc_command_line.rb'
26
- require 'tc_oai.rb'
26
+ require 'tc_oai.rb'
27
+ require 'tc_filesplitter.rb'
28
+ require 'tc_upload.rb'
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pho
3
3
  version: !ruby/object:Gem::Version
4
- hash: 19
5
- prerelease: false
4
+ hash: 17
5
+ prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 7
9
- - 8
10
- version: 0.7.8
9
+ - 9
10
+ version: 0.7.9
11
11
  platform: ruby
12
12
  authors:
13
13
  - Leigh Dodds
@@ -15,8 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-10-20 00:00:00 +01:00
19
- default_executable:
18
+ date: 2011-09-21 00:00:00 Z
20
19
  dependencies:
21
20
  - !ruby/object:Gem::Dependency
22
21
  name: httpclient
@@ -82,6 +81,48 @@ dependencies:
82
81
  version: "1.16"
83
82
  type: :runtime
84
83
  version_requirements: *id004
84
+ - !ruby/object:Gem::Dependency
85
+ name: rdf
86
+ prerelease: false
87
+ requirement: &id005 !ruby/object:Gem::Requirement
88
+ none: false
89
+ requirements:
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ hash: 3
93
+ segments:
94
+ - 0
95
+ version: "0"
96
+ type: :runtime
97
+ version_requirements: *id005
98
+ - !ruby/object:Gem::Dependency
99
+ name: rdf-json
100
+ prerelease: false
101
+ requirement: &id006 !ruby/object:Gem::Requirement
102
+ none: false
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ hash: 3
107
+ segments:
108
+ - 0
109
+ version: "0"
110
+ type: :runtime
111
+ version_requirements: *id006
112
+ - !ruby/object:Gem::Dependency
113
+ name: rdf-raptor
114
+ prerelease: false
115
+ requirement: &id007 !ruby/object:Gem::Requirement
116
+ none: false
117
+ requirements:
118
+ - - ">="
119
+ - !ruby/object:Gem::Version
120
+ hash: 3
121
+ segments:
122
+ - 0
123
+ version: "0"
124
+ type: :runtime
125
+ version_requirements: *id007
85
126
  description: Ruby client for the Talis Platform
86
127
  email: leigh.dodds@talis.com
87
128
  executables:
@@ -104,6 +145,7 @@ files:
104
145
  - doc/rdoc/files/lib/pho/store_rb.html
105
146
  - doc/rdoc/files/lib/pho/status_rb.html
106
147
  - doc/rdoc/files/lib/pho/resource_hash_rb.html
148
+ - doc/rdoc/files/lib/pho/upload_rb.html
107
149
  - doc/rdoc/files/lib/pho/field_predicate_map_rb.html
108
150
  - doc/rdoc/files/lib/pho/job_rb.html
109
151
  - doc/rdoc/files/lib/pho/command_line_rb.html
@@ -149,8 +191,12 @@ files:
149
191
  - doc/rdoc/classes/Pho/Update/ResourceStatement.html
150
192
  - doc/rdoc/classes/Pho/Store.html
151
193
  - doc/rdoc/classes/Pho/FileManagement/RDFManager.html
194
+ - doc/rdoc/classes/Pho/FileManagement/BNodeRewritingHandler.html
152
195
  - doc/rdoc/classes/Pho/FileManagement/AbstractFileManager.html
196
+ - doc/rdoc/classes/Pho/FileManagement/StatementHandler.html
153
197
  - doc/rdoc/classes/Pho/FileManagement/FileManager.html
198
+ - doc/rdoc/classes/Pho/FileManagement/FileSplitter.html
199
+ - doc/rdoc/classes/Pho/FileManagement/Util.html
154
200
  - doc/rdoc/classes/Pho/QueryProfile.html
155
201
  - doc/rdoc/classes/Pho/JobUpdate.html
156
202
  - doc/rdoc/classes/Pho/ResourceHash.html
@@ -185,9 +231,11 @@ files:
185
231
  - tests/tc_facet.rb
186
232
  - tests/tc_metabox.rb
187
233
  - tests/tc_jobcontrol.rb
234
+ - tests/tc_filesplitter.rb
188
235
  - tests/ts_pho.rb
189
236
  - tests/tc_contentbox.rb
190
237
  - tests/tc_changeset_builder.rb
238
+ - tests/tc_bnodehandler.rb
191
239
  - tests/tc_rdf_collection.rb
192
240
  - tests/tc_converter.rb
193
241
  - tests/tc_field_predicate_map.rb
@@ -222,11 +270,11 @@ files:
222
270
  - lib/pho/changeset.rb
223
271
  - lib/pho/query_profile.rb
224
272
  - lib/pho/facet.rb
273
+ - lib/pho/upload.rb
225
274
  - lib/pho/enrichment.rb
226
275
  - lib/pho/etags.rb
227
276
  - lib/pho/field_predicate_map.rb
228
277
  - lib/pho.rb
229
- has_rdoc: true
230
278
  homepage: http://pho.rubyforge.net
231
279
  licenses: []
232
280
 
@@ -262,7 +310,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
262
310
  requirements: []
263
311
 
264
312
  rubyforge_project: pho
265
- rubygems_version: 1.3.7
313
+ rubygems_version: 1.8.9
266
314
  signing_key:
267
315
  specification_version: 3
268
316
  summary: Ruby client for the Talis Platform