pho 0.7.8 → 0.7.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. data/CHANGES +8 -0
  2. data/Rakefile +4 -2
  3. data/doc/rdoc/classes/Pho.html +4 -0
  4. data/doc/rdoc/classes/Pho/CommandLine.html +114 -114
  5. data/doc/rdoc/classes/Pho/DatatypeProperty.html +12 -12
  6. data/doc/rdoc/classes/Pho/Enrichment/StoreEnricher.html +29 -29
  7. data/doc/rdoc/classes/Pho/Etags.html +36 -36
  8. data/doc/rdoc/classes/Pho/Facet/Results.html +19 -19
  9. data/doc/rdoc/classes/Pho/Facet/Term.html +6 -6
  10. data/doc/rdoc/classes/Pho/FieldPredicateMap.html +102 -102
  11. data/doc/rdoc/classes/Pho/FieldWeighting.html +12 -12
  12. data/doc/rdoc/classes/Pho/FileManagement.html +16 -0
  13. data/doc/rdoc/classes/Pho/FileManagement/AbstractFileManager.html +168 -108
  14. data/doc/rdoc/classes/Pho/FileManagement/BNodeRewritingHandler.html +226 -0
  15. data/doc/rdoc/classes/Pho/FileManagement/FileManager.html +48 -35
  16. data/doc/rdoc/classes/Pho/FileManagement/FileSplitter.html +291 -0
  17. data/doc/rdoc/classes/Pho/FileManagement/RDFManager.html +22 -22
  18. data/doc/rdoc/classes/Pho/FileManagement/StatementHandler.html +152 -0
  19. data/doc/rdoc/classes/Pho/FileManagement/Util.html +212 -0
  20. data/doc/rdoc/classes/Pho/Job.html +64 -64
  21. data/doc/rdoc/classes/Pho/Jobs.html +60 -60
  22. data/doc/rdoc/classes/Pho/OAI/Record.html +6 -6
  23. data/doc/rdoc/classes/Pho/OAI/Records.html +24 -24
  24. data/doc/rdoc/classes/Pho/OAI/Statistics.html +12 -12
  25. data/doc/rdoc/classes/Pho/QueryProfile.html +66 -66
  26. data/doc/rdoc/classes/Pho/ResourceHash/Converter.html +36 -36
  27. data/doc/rdoc/classes/Pho/ResourceHash/SetAlgebra.html +12 -12
  28. data/doc/rdoc/classes/Pho/Snapshot.html +35 -35
  29. data/doc/rdoc/classes/Pho/Sparql/SparqlClient.html +74 -74
  30. data/doc/rdoc/classes/Pho/Sparql/SparqlHelper.html +78 -78
  31. data/doc/rdoc/classes/Pho/Status.html +26 -26
  32. data/doc/rdoc/classes/Pho/Store.html +215 -215
  33. data/doc/rdoc/classes/Pho/StoreSparqlClient.html +12 -12
  34. data/doc/rdoc/classes/Pho/Update/Changeset.html +63 -63
  35. data/doc/rdoc/classes/Pho/Update/ChangesetBuilder.html +24 -24
  36. data/doc/rdoc/classes/Pho/Update/ChangesetHelper.html +6 -6
  37. data/doc/rdoc/classes/Pho/Update/Changesets.html +12 -12
  38. data/doc/rdoc/classes/Pho/Update/LiteralStatement.html +18 -18
  39. data/doc/rdoc/classes/Pho/Update/ResourceStatement.html +24 -24
  40. data/doc/rdoc/classes/Pho/Update/Statement.html +24 -24
  41. data/doc/rdoc/classes/String.html +1 -1
  42. data/doc/rdoc/created.rid +1 -1
  43. data/doc/rdoc/files/CHANGES.html +13 -1
  44. data/doc/rdoc/files/lib/pho/file_management_rb.html +1 -1
  45. data/doc/rdoc/files/lib/pho/file_manager_rb.html +1 -1
  46. data/doc/rdoc/files/lib/pho/rdf_collection_rb.html +1 -1
  47. data/doc/rdoc/files/lib/pho/upload_rb.html +101 -0
  48. data/doc/rdoc/files/lib/pho_rb.html +4 -2
  49. data/doc/rdoc/fr_class_index.html +4 -0
  50. data/doc/rdoc/fr_file_index.html +1 -0
  51. data/doc/rdoc/fr_method_index.html +228 -218
  52. data/lib/pho.rb +4 -1
  53. data/lib/pho/file_management.rb +30 -1
  54. data/lib/pho/file_manager.rb +7 -0
  55. data/lib/pho/rdf_collection.rb +4 -4
  56. data/lib/pho/upload.rb +156 -0
  57. data/tests/tc_bnodehandler.rb +42 -0
  58. data/tests/tc_filesplitter.rb +51 -0
  59. data/tests/ts_pho.rb +3 -1
  60. metadata +56 -8
data/lib/pho.rb CHANGED
@@ -4,7 +4,9 @@ require 'json'
4
4
  require 'yaml'
5
5
  require 'date'
6
6
  require 'rexml/document'
7
- require 'md5'
7
+ #require 'md5'
8
+ require "digest/md5"
9
+ require "ftools"
8
10
 
9
11
  #RDF.rb
10
12
  require 'rdf'
@@ -31,6 +33,7 @@ require 'pho/enrichment'
31
33
  require 'pho/command_line'
32
34
  require 'pho/oai'
33
35
  require 'pho/converter'
36
+ require 'pho/upload.rb'
34
37
 
35
38
  if RUBY_VERSION < "1.8.7"
36
39
  class String
data/lib/pho/file_management.rb CHANGED
@@ -1,7 +1,31 @@
1
1
  module Pho
2
2
 
3
+ #This module provides support for managing local directories of files and mirroring them
4
+ #into either the metabox or contentbox of a Talis Platform store
3
5
  module FileManagement
4
6
 
7
+ #Base class for the management of collections of files
8
+ #
9
+ #A collection is considered to be a directory (and sub-directory) of files. Depending on
10
+ #the type of collection manager used (e.g. +Pho::FileManagement::FileManager+ or
11
+ #+Pho::FileManagement::RDFManager+) these may be collections of any kind of file, or just
12
+ #RDF serialisations.
13
+ #
14
+ #The basic mechanism has support for:
15
+ # * tracking whether a file has been successfully loaded into a store (or not)
16
+ # * identifying all new files, just those that have been successfully stored, or failed
17
+ # * identifying files that have been changed locally since last stored
18
+ # * storage of any of these classes of files either by uploading into the contentbox or
19
+ # storing in the metabox
20
+ # * resumable uploads
21
+ # * retrying uploads
22
+ # * traversal across a directory structure to manage a complete set of files
23
+ #
24
+ #Files are tracked locally by keeping some hidden files in the same directory system as the
25
+ #files being tracked. These are currently all managed in a hidden directory (".pho") that is
26
+ #automatically created. To access error messages from uploads look for the corresponding "fail file"
27
+ #for the relevant file. E.g. /foo/file.txt will be tracked in /foo/.pho, and will have either an
28
+ #OK file (/foo/.pho/file.txt.ok) or a fail file (/foo/.pho/file.txt.fail).
5
29
  class AbstractFileManager
6
30
 
7
31
  attr_reader :dir
@@ -162,7 +186,12 @@ module Pho
162
186
  relative_path = relative_path.gsub(base, "#{TRACKING_DIR}/#{base}")
163
187
  return "#{@dir}#{relative_path}.#{@ok_suffix}"
164
188
  end
165
-
189
+
190
+ def create_tracking_dir(filename)
191
+ path = filename.split("/")[0..-2].join("/")
192
+ Dir.mkdir("#{path}/#{TRACKING_DIR}") unless File.exists?("#{path}/#{TRACKING_DIR}")
193
+ end
194
+
166
195
  end
167
196
 
168
197
  end
data/lib/pho/file_manager.rb CHANGED
@@ -4,6 +4,13 @@ module Pho
4
4
 
5
5
  module FileManagement
6
6
 
7
+ #Manages a collection of files in a directory structure, uploading them to
8
+ #the contentbox of a platform store
9
+ #
10
+ #Allows the base directory of the upload to be specified, giving some flexibility
11
+ #on how the files are published via the contentbox. By default the files will
12
+ #be rooted in /items/ but by specifying the base parameter in the construct, this
13
+ #can be altered to, e.g. /items/foo
7
14
  class FileManager < AbstractFileManager
8
15
 
9
16
  attr_reader :base
data/lib/pho/rdf_collection.rb CHANGED
@@ -2,10 +2,10 @@ module Pho
2
2
 
3
3
  module FileManagement
4
4
 
5
- # Provides a simple mechanism for managing a directory of RDF/XML documents
6
- # and uploading them to platform store.
5
+ #Provides a simple mechanism for managing a directory of RDF documents
6
+ #and uploading them to platform store.
7
7
  #
8
- # Allows a collection to be mirrored into the platform
8
+ #Currently supports RDF, Turtle and NTriples
9
9
  class RDFManager < AbstractFileManager
10
10
 
11
11
  RDF = "rdf".freeze
@@ -40,7 +40,7 @@ module Pho
40
40
  else
41
41
  response = @store.store_file(file)
42
42
  end
43
-
43
+ create_tracking_dir(filename)
44
44
  if (response.status < 300 )
45
45
  File.open(get_ok_file_for(filename), "w") do |file|
46
46
  file.print( "OK" )
data/lib/pho/upload.rb ADDED
@@ -0,0 +1,156 @@
1
+ module Pho
2
+
3
+ module FileManagement
4
+
5
+ #Default statement handler, does nothing
6
+ class StatementHandler
7
+ def handle(statement)
8
+ return statement
9
+ end
10
+ end
11
+
12
+ #Remove bnodes from the input data by assigning URIs to them
13
+ #
14
+ #This implementation generates a simple hexdigest based on the node id
15
+ #and uses that to construct a uri based on a base uri provided in the
16
+ #constructor
17
+ class BNodeRewritingHandler
18
+ # base:: base uri for URIs generated for blank nodes
19
+ def initialize(base)
20
+ @base = base
21
+ end
22
+
23
+ def handle(statement)
24
+ if !statement.has_blank_nodes?
25
+ return statement
26
+ end
27
+ subject = statement.subject
28
+ if subject.anonymous?
29
+ subject = RDF::URI.new( assign_uri(subject) )
30
+ end
31
+ object = statement.object
32
+ if object.anonymous?
33
+ object = RDF::URI.new( assign_uri(object) )
34
+ end
35
+ return RDF::Statement.new(subject, statement.predicate, object)
36
+ end
37
+
38
+ #FIXME semantics for this is wrong if nodeIds are reused across
39
+ #datasets
40
+ def assign_uri(node)
41
+ return "#{@base}/#{Digest::MD5.hexdigest( node.id )}#self"
42
+ end
43
+
44
+ end
45
+
46
+ #Supports splitting RDF data files into smaller chunks of ntriples
47
+ class FileSplitter
48
+
49
+ attr_reader :dir, :triples, :handler
50
+
51
+ DEFAULT_CHUNK_SIZE = 10000
52
+
53
+ #Create a file splitter instance
54
+ #
55
+ # dir:: temporary directory into which split files should be written
56
+ # triples:: number of triples per split file
57
+ # handler:: statement handler to allow pre-processing of statements
58
+ def initialize(dir="/tmp", triples=DEFAULT_CHUNK_SIZE,
59
+ handler=Pho::FileManagement::StatementHandler.new)
60
+ @dir = dir
61
+ @triples = triples
62
+ @handler = handler
63
+ end
64
+
65
+ #Split a single file, in any parseable RDF format into smaller
66
+ #chunks of ntriples. Chunked files are stored in default temporary
67
+ #directory for this instance
68
+ #
69
+ # filename:: name of the file to split
70
+ # format:: input format, default is :ntriples
71
+ def split_file(filename, format=:ntriples)
72
+
73
+ basename = File.basename(filename, ".#{filename.split(".").last}")
74
+ count = 0
75
+ stmts = []
76
+ RDF::Reader.for(format).new(File.new(filename)) do |reader|
77
+ reader.each_statement do |statement|
78
+ count += 1
79
+ stmts << @handler.handle( statement )
80
+ if count % @triples == 0
81
+ RDF::Writer.open( File.join(@dir, "#{basename}_#{count}.nt") ) do |writer|
82
+ stmts.each do |s|
83
+ writer << s
84
+ end
85
+ end
86
+ stmts = []
87
+ end
88
+ end
89
+ end
90
+ if !stmts.empty?
91
+ RDF::Writer.open( File.join(@dir, "#{basename}_#{count}.nt") ) do |writer|
92
+ stmts.each do |s|
93
+ writer << s
94
+ end
95
+ end
96
+ end
97
+ end
98
+
99
+ #Split a list of files into smaller chunks
100
+ #
101
+ # list_of_filenames:: array of filenames
102
+ # format:: format of the files, default is :ntriples
103
+ def split_files(list_of_filenames, format=:ntriples)
104
+ list_of_filenames.each do |name|
105
+ split_file(name, format)
106
+ end
107
+ end
108
+
109
+ end
110
+
111
+ class Util
112
+
113
+ #Take a directory of files, copy them to temporary directory, splitting
114
+ #where necessary, in preparation for uploading to a platform store.
115
+ #
116
+ #Source directory is scanned for ntriple, turtle and RDF/XML files. All of
117
+ #these are automatically chunked into 10,000 triple chunks and re-serialized
118
+ #as ntriples
119
+ #
120
+ #BNodes are automatically re-written to full uris.
121
+ #
122
+ # store:: Pho::Store into which data will be posted. Used to normalizing bnodes
123
+ # src_dir:: directory containing source data.
124
+ def Util.prepare_platform_upload(store, src_dir, collection_dir,
125
+ triples=FileSplitter::DEFAULT_CHUNK_SIZE)
126
+
127
+ handler = BNodeRewritingHandler.new( store.build_uri("/items") )
128
+ splitter = FileSplitter.new(collection_dir, triples, handler )
129
+
130
+ formats = [ ["*.rdf", :rdfxml], ["*.nt", :ntriples], ["*.ttl", :turtle] ]
131
+ formats.each do |format|
132
+
133
+ files = Dir.glob( File.join(src_dir, format[0] ) )
134
+ splitter.split_files(files, format[1] )
135
+
136
+ end
137
+ return true
138
+
139
+ end
140
+
141
+ #Prepares a batch of files for uploading into the platform, then posts
142
+ #that collection to the designated store
143
+ #
144
+ #Returns an RDFManager instance that can be inspected to check for successes
145
+ def Util.prepare_and_store_upload(store, src_dir, collection_dir,
146
+ triples=FileSplitter::DEFAULT_CHUNK_SIZE)
147
+
148
+ prepare_platform_upload(store, src_dir, collection_dir)
149
+ collection = Pho::FileManagement::RDFManager.new(store, collection_dir)
150
+ collection.store()
151
+ return collection
152
+ end
153
+
154
+ end
155
+ end
156
+ end
data/tests/tc_bnodehandler.rb ADDED
@@ -0,0 +1,42 @@
1
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
2
+ require 'pho'
3
+ require 'test/unit'
4
+ require 'mocha'
5
+
6
+ class BNodeRewritingHandlerTest < Test::Unit::TestCase
7
+
8
+ def test_uris_are_not_changed()
9
+ handler = Pho::FileManagement::BNodeRewritingHandler.new("http://example.org")
10
+
11
+ subject = RDF::URI.new("http://example.org/subject")
12
+ object = RDF::URI.new("http://example.org/object")
13
+
14
+ statement = RDF::Statement.new(subject, RDF::RDFS.label, object)
15
+
16
+ handled = handler.handle(statement)
17
+
18
+ assert_equal( statement.subject, handled.subject)
19
+ assert_equal( statement.predicate, handled.predicate)
20
+ assert_equal( statement.object, handled.object)
21
+
22
+ end
23
+
24
+ def test_uri_assignment()
25
+ handler = Pho::FileManagement::BNodeRewritingHandler.new("http://example.org")
26
+
27
+ subject = RDF::Node.new
28
+ object = RDF::Node.new
29
+
30
+ statement = RDF::Statement.new(subject, RDF::RDFS.label, object)
31
+ handled = handler.handle(statement)
32
+
33
+ assert_equal(statement.predicate, handled.predicate)
34
+ assert_equal(false, handled.subject.anonymous?)
35
+ assert_equal(false, handled.object.anonymous?)
36
+ assert_equal("http://example.org/#{Digest::MD5.hexdigest( subject.id )}#self",
37
+ handled.subject.to_s)
38
+ assert_equal("http://example.org/#{Digest::MD5.hexdigest( object.id )}#self",
39
+ handled.object.to_s)
40
+
41
+ end
42
+ end
data/tests/tc_filesplitter.rb ADDED
@@ -0,0 +1,51 @@
1
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
2
+ require 'pho'
3
+ require 'test/unit'
4
+ require 'mocha'
5
+
6
+ class FileSplitterTest < Test::Unit::TestCase
7
+
8
+ def setup()
9
+ Dir.mkdir("/tmp/pho") unless File.exists?("/tmp/pho")
10
+ Dir.mkdir("/tmp/pho/split") unless File.exists?("/tmp/pho/split")
11
+ Dir.mkdir("/tmp/pho/split/tmp") unless File.exists?("/tmp/pho/split/tmp")
12
+
13
+ RDF::Writer.open("/tmp/pho/split/large.nt") do |writer|
14
+ 31.times do
15
+ writer << RDF::Statement.new(
16
+ RDF::Resource.new("http://www.example.org"),
17
+ RDF::RDFS.label,
18
+ RDF::Literal.new("This is a test")
19
+ )
20
+ end
21
+ end
22
+
23
+ end
24
+
25
+ def teardown()
26
+ Dir.glob("/tmp/pho/split/tmp/*.*") do |file|
27
+ File.delete(file)
28
+ end
29
+ Dir.glob("/tmp/pho/split/*.*") do |file|
30
+ File.delete(file)
31
+ end
32
+ delete("/tmp/pho/split/tmp")
33
+ delete("/tmp/pho/split")
34
+ end
35
+
36
+ def delete(dir)
37
+ Dir.delete(dir) if File.exists?(dir)
38
+ end
39
+
40
+ def test_split_file()
41
+ splitter = Pho::FileManagement::FileSplitter.new("/tmp/pho/split/tmp", 10)
42
+ splitter.split_file("/tmp/pho/split/large.nt")
43
+ assert_equal(4, Dir.glob("/tmp/pho/split/tmp/large*.nt").size )
44
+ assert_equal(true, File.exists?("/tmp/pho/split/tmp/large_10.nt"))
45
+ assert_equal(true, File.exists?("/tmp/pho/split/tmp/large_20.nt"))
46
+ assert_equal(true, File.exists?("/tmp/pho/split/tmp/large_30.nt"))
47
+ assert_equal(true, File.exists?("/tmp/pho/split/tmp/large_31.nt"))
48
+
49
+ end
50
+
51
+ end
data/tests/ts_pho.rb CHANGED
@@ -23,4 +23,6 @@ require 'tc_changesets.rb'
23
23
  require 'tc_converter.rb'
24
24
  require 'tc_enrichment.rb'
25
25
  require 'tc_command_line.rb'
26
- require 'tc_oai.rb'
26
+ require 'tc_oai.rb'
27
+ require 'tc_filesplitter.rb'
28
+ require 'tc_upload.rb'
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pho
3
3
  version: !ruby/object:Gem::Version
4
- hash: 19
5
- prerelease: false
4
+ hash: 17
5
+ prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 7
9
- - 8
10
- version: 0.7.8
9
+ - 9
10
+ version: 0.7.9
11
11
  platform: ruby
12
12
  authors:
13
13
  - Leigh Dodds
@@ -15,8 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-10-20 00:00:00 +01:00
19
- default_executable:
18
+ date: 2011-09-21 00:00:00 Z
20
19
  dependencies:
21
20
  - !ruby/object:Gem::Dependency
22
21
  name: httpclient
@@ -82,6 +81,48 @@ dependencies:
82
81
  version: "1.16"
83
82
  type: :runtime
84
83
  version_requirements: *id004
84
+ - !ruby/object:Gem::Dependency
85
+ name: rdf
86
+ prerelease: false
87
+ requirement: &id005 !ruby/object:Gem::Requirement
88
+ none: false
89
+ requirements:
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ hash: 3
93
+ segments:
94
+ - 0
95
+ version: "0"
96
+ type: :runtime
97
+ version_requirements: *id005
98
+ - !ruby/object:Gem::Dependency
99
+ name: rdf-json
100
+ prerelease: false
101
+ requirement: &id006 !ruby/object:Gem::Requirement
102
+ none: false
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ hash: 3
107
+ segments:
108
+ - 0
109
+ version: "0"
110
+ type: :runtime
111
+ version_requirements: *id006
112
+ - !ruby/object:Gem::Dependency
113
+ name: rdf-raptor
114
+ prerelease: false
115
+ requirement: &id007 !ruby/object:Gem::Requirement
116
+ none: false
117
+ requirements:
118
+ - - ">="
119
+ - !ruby/object:Gem::Version
120
+ hash: 3
121
+ segments:
122
+ - 0
123
+ version: "0"
124
+ type: :runtime
125
+ version_requirements: *id007
85
126
  description: Ruby client for the Talis Platform
86
127
  email: leigh.dodds@talis.com
87
128
  executables:
@@ -104,6 +145,7 @@ files:
104
145
  - doc/rdoc/files/lib/pho/store_rb.html
105
146
  - doc/rdoc/files/lib/pho/status_rb.html
106
147
  - doc/rdoc/files/lib/pho/resource_hash_rb.html
148
+ - doc/rdoc/files/lib/pho/upload_rb.html
107
149
  - doc/rdoc/files/lib/pho/field_predicate_map_rb.html
108
150
  - doc/rdoc/files/lib/pho/job_rb.html
109
151
  - doc/rdoc/files/lib/pho/command_line_rb.html
@@ -149,8 +191,12 @@ files:
149
191
  - doc/rdoc/classes/Pho/Update/ResourceStatement.html
150
192
  - doc/rdoc/classes/Pho/Store.html
151
193
  - doc/rdoc/classes/Pho/FileManagement/RDFManager.html
194
+ - doc/rdoc/classes/Pho/FileManagement/BNodeRewritingHandler.html
152
195
  - doc/rdoc/classes/Pho/FileManagement/AbstractFileManager.html
196
+ - doc/rdoc/classes/Pho/FileManagement/StatementHandler.html
153
197
  - doc/rdoc/classes/Pho/FileManagement/FileManager.html
198
+ - doc/rdoc/classes/Pho/FileManagement/FileSplitter.html
199
+ - doc/rdoc/classes/Pho/FileManagement/Util.html
154
200
  - doc/rdoc/classes/Pho/QueryProfile.html
155
201
  - doc/rdoc/classes/Pho/JobUpdate.html
156
202
  - doc/rdoc/classes/Pho/ResourceHash.html
@@ -185,9 +231,11 @@ files:
185
231
  - tests/tc_facet.rb
186
232
  - tests/tc_metabox.rb
187
233
  - tests/tc_jobcontrol.rb
234
+ - tests/tc_filesplitter.rb
188
235
  - tests/ts_pho.rb
189
236
  - tests/tc_contentbox.rb
190
237
  - tests/tc_changeset_builder.rb
238
+ - tests/tc_bnodehandler.rb
191
239
  - tests/tc_rdf_collection.rb
192
240
  - tests/tc_converter.rb
193
241
  - tests/tc_field_predicate_map.rb
@@ -222,11 +270,11 @@ files:
222
270
  - lib/pho/changeset.rb
223
271
  - lib/pho/query_profile.rb
224
272
  - lib/pho/facet.rb
273
+ - lib/pho/upload.rb
225
274
  - lib/pho/enrichment.rb
226
275
  - lib/pho/etags.rb
227
276
  - lib/pho/field_predicate_map.rb
228
277
  - lib/pho.rb
229
- has_rdoc: true
230
278
  homepage: http://pho.rubyforge.net
231
279
  licenses: []
232
280
 
@@ -262,7 +310,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
262
310
  requirements: []
263
311
 
264
312
  rubyforge_project: pho
265
- rubygems_version: 1.3.7
313
+ rubygems_version: 1.8.9
266
314
  signing_key:
267
315
  specification_version: 3
268
316
  summary: Ruby client for the Talis Platform