orchard 0.1

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,26 @@
1
+ Copyright (c) 2010, Regents of the University of California
2
+ All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are met:
6
+
7
+ * Redistributions of source code must retain the above copyright
8
+ notice, this list of conditions and the following disclaimer.
9
+ * Redistributions in binary form must reproduce the above copyright
10
+ notice, this list of conditions and the following disclaimer in the
11
+ documentation and/or other materials provided with the distribution.
12
+ * Neither the name of the University of California nor the names of
13
+ its contributors may be used to endorse or promote products derived
14
+ from this software without specific prior written permission.
15
+
16
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
17
+ IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
18
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19
+ PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OF THE UNIVERSITY
20
+ OF CALIFORNIA BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,21 @@
1
+ Orchard is a Ruby library for working with Pairtrees, a filesystem hierarchy
2
+ mapping identifiers to object directories.
3
+
4
+ More information can be found at:
5
+
6
+ Pairtrees for Object Storage
7
+ https://confluence.ucop.edu/display/Curation/PairTree
8
+
9
+ ==== Usage Examples
10
+
11
+ Pairtree.encode('ark:/13030/xt12t3')
12
+ # => ark+=13030=xt12t3
13
+
14
+ Pairtree.decode('ark+=13030=xt12t3')
15
+ # => ark:/13030/xt12t3
16
+
17
+ Pairtree.id_to_ppath('ark:/13030/xt12t3')
18
+ # => ar/k+/=1/30/30/=x/t1/2t/3/
19
+
20
+ Pairtree.ppath_to_id('ar/k+/=1/30/30/=x/t1/2t/3')
21
+ # => ark:/13030/xt12t3
@@ -0,0 +1,29 @@
1
+ require 'orchard/pairtree'
2
+ require 'orchard/version'
3
+
4
+ # Orchard is a Ruby library for working with Pairtrees, a filesystem hierarchy
5
+ # mapping identifiers to object directories.
6
+ #
7
+ # More information can be found at:
8
+ #
9
+ # Pairtrees for Object Storage
10
+ # https://confluence.ucop.edu/display/Curation/PairTree
11
+ #
12
+ # ==== Usage Examples
13
+ #
14
+ # Pairtree.encode('ark:/13030/xt12t3')
15
+ # # => ark+=13030=xt12t3
16
+ #
17
+ # Pairtree.decode('ark+=13030=xt12t3')
18
+ # # => ark:/13030/xt12t3
19
+ #
20
+ # Pairtree.id_to_ppath('ark:/13030/xt12t3')
21
+ # # => ar/k+/=1/30/30/=x/t1/2t/3/
22
+ #
23
+ # Pairtree.ppath_to_id('ar/k+/=1/30/30/=x/t1/2t/3')
24
+ # # => ark:/13030/xt12t3
25
+
26
module Orchard
  # Error class for pairpath strings that cannot be interpreted as a
  # Pairtree path. Subclasses StandardError, so a bare +rescue+ catches it.
  InvalidPPathError = Class.new(StandardError)
end
@@ -0,0 +1,175 @@
1
module Orchard
  # Provides a set of methods for working with Pairtree paths.
  class Pairtree
    # Maximum number of characters in one ppath directory name (a "shorty").
    MAX_SHORTY = 2

    # Characters hex-encoded (as ^hh) in the first cleaning pass: the listed
    # visible ASCII characters plus anything outside the range 0x21-0x7e.
    ENCODE_REGEX = /[\"*+,<=>?\\^|]|[^\x21-\x7e]/u

    # Tokenizes a cleaned string into either a ^hh escape (first capture) or
    # a single literal character (second capture).
    DECODE_REGEX = /\^(..)|(.)/u

    # Matches a pairpath with an optional leading "pairtree_root/" and an
    # optional trailing slash; capture 1 is the run of 2-character
    # directories ending in a 1- or 2-character directory.
    # NOTE(review): +^+ and +$+ anchor per line, not per string, so a
    # multi-line input can match on one of its lines.
    PPATH_REGEX = /^(?:pairtree_root\/)?((?>[^:\/\.|.]{2}\/)*[^:\/\.|.]{1,2})(?:\/?$)/

    # Single-character substitutions applied in the second cleaning pass.
    CHAR_ENCODE_CONV = { '/' => '=', ':' => '+', '.' => ',' }.freeze

    # Inverse of CHAR_ENCODE_CONV, applied first when decoding.
    CHAR_DECODE_CONV = { '=' => '/', '+' => ':', ',' => '.' }.freeze

    # Encodes a given +id+ <em>(String)</em> according to the "identifier string
    # cleaning" in the pairtree 0.1 specification. Returns a new String; +id+
    # is not modified.
    #
    #   encode(id)
    #
    # ==== Examples
    #
    #   Pairtree.encode('ark:/13030/xt12t3')
    #   # => ark+=13030=xt12t3
    #
    #   Pairtree.encode('http://n2t.info/urn:nbn:se:kb:repos-1')
    #   # => http+==n2t,info=urn+nbn+se+kb+repos-1
    #
    #   Pairtree.encode('what-the-*@?#!^!?')
    #   # => what-the-^2a@^3f#!^5e!^3f
    #
    # ==== Explanation (From Pairtree 0.1 Specification)
    #
    # Identifier string cleaning
    #
    # Prior to splitting into character pairs, identifier strings are cleaned in
    # two separate steps. One step would be simpler, but pairtree is designed so
    # that commonly used characters in reasonably opaque identifiers (e.g., not
    # containing natural language words, phrases, or hints) result in reasonably
    # short and familiar-looking paths. For completeness, the pairtree algorithm
    # specifies what to do with all possible UTF-8 characters, and relies for this
    # on a kind of URL hex-encoding. To avoid conflict with URLs, pairtree
    # hex-encoding is introduced with the '^' character instead of '%'.
    #
    # First, the identifier string is cleaned of characters that are expected to
    # occur rarely in object identifiers but that would cause certain known
    # problems for file systems. In this step, every UTF-8 octet outside the range
    # of visible ASCII (94 characters with hexadecimal codes 21-7e) [ASCII], as
    # well as the following visible ASCII characters,
    #
    #   " * + , < = > ? \ ^ |
    #
    # must be converted to their corresponding 3-character hexadecimal encoding,
    # ^hh, where ^ is a circumflex and hh is two hex digits. For example, ' '
    # (space) is converted to ^20 and '*' to ^2a. In the second step, the
    # following single-character to single-character conversions must be done.
    #
    #   / -> =
    #   : -> +
    #   . -> ,
    #
    # These are characters that occur quite commonly in opaque identifiers but
    # present special problems for filesystems. This step avoids requiring them
    # to be hex encoded (hence expanded to three characters), which keeps the
    # typical ppath reasonably short. Here are examples of identifier strings
    # after cleaning and after ppath mapping.
    #
    #   id:        ark:/13030/xt12t3
    #   cleaned:   ark+=13030=xt12t3
    #   ppath:     ar/k+/=1/30/30/=x/t1/2t/3/
    #
    def self.encode(id)
      # First pass: hex-escape every byte of each character that needs it, so
      # a multi-byte UTF-8 character becomes several ^hh groups.
      hex_escaped = id.gsub(ENCODE_REGEX) do |char|
        char.bytes.map { |byte| format('^%02x', byte) }.join
      end

      # Second pass: single-character substitutions for '/', ':' and '.'.
      hex_escaped.gsub(%r{[/:.]}, CHAR_ENCODE_CONV)
    end

    # Decodes a given +id+ <em>(String)</em> according to the pairtree 0.1
    # specification. This reverses #encode and returns a new UTF-8 String.
    #
    #   decode(id)
    #
    # ==== Examples
    #
    #   Pairtree.decode('ark+=13030=xt12t3')
    #   # => ark:/13030/xt12t3
    #
    #   Pairtree.decode('http+==n2t,info=urn+nbn+se+kb+repos-1')
    #   # => http://n2t.info/urn:nbn:se:kb:repos-1
    #
    #   Pairtree.decode('what-the-^2a@^3f#!^5e!^3f')
    #   # => what-the-*@?#!^!?
    #
    def self.decode(id)
      # First pass (reverses the second pass of encode).
      substituted = id.gsub(/[=+,]/, CHAR_DECODE_CONV)

      # Second pass (reverses the first pass of encode): collect the byte
      # value of every ^hh escape or literal character, then reassemble the
      # bytes so multi-byte UTF-8 sequences are restored.
      byte_values = substituted.scan(DECODE_REGEX).map do |coded, chr|
        coded ? coded.hex : chr.ord
      end
      byte_values.pack('C*').force_encoding('utf-8')
    end

    # Constructs the pairpath for a given +id+ <em>(String)</em> and +options+.
    # The +id+ argument is never modified. The returned path ends with a
    # trailing slash.
    #
    #   id_to_ppath(id, options = {})
    #
    # ==== Options
    # * <tt>:prefix => Pairtree prefix</tt> - If the id starts with this exact
    #   (literal) prefix, the prefix is removed from the id before the pairpath
    #   is created.
    #
    # ==== Examples
    #
    #   Pairtree.id_to_ppath('abcde')
    #   # => ab/cd/e/
    #
    # or with the prefix option
    #
    #   Pairtree.id_to_ppath('http://dom.org/abcde', :prefix => 'http://dom.org/')
    #   # => ab/cd/e/
    #
    # ==== Explanation (From Pairtree 0.1 Specification) The basic pairtree algorithm
    #
    # The pairtree algorithm maps an arbitrary UTF-8 [RFC3629] encoded identifier
    # string into a filesystem directory path based on successive pairs of
    # characters, and also defines the reverse mapping (from pathname to
    # identifier).
    #
    # In this document the word "directory" is used interchangeably with the word
    # "folder" and all examples conform to Unix-based filesystem conventions which
    # should translate easily to Windows conventions after substituting the path
    # separator ('\' instead of '/'). Pairtree places no limitations on file and
    # path lengths, so implementors thinking about maximal interoperation may
    # wish to consider the issues listed in the Interoperability section of
    # this document.
    #
    # The mapping from identifier string to path has two parts. First, the string
    # is cleaned by converting characters that would be illegal or especially
    # problematic in Unix or Windows filesystems. The cleaned string is then
    # split into pairs of characters, each of which becomes a directory name
    # in a filesystem path: successive pairs map to successive path components
    # until there are no characters left, with the last component being either
    # a 1- or 2-character directory name. The resulting path is known as
    # a pairpath, or ppath.
    #
    #   abcd       -> ab/cd/
    #   abcdefg    -> ab/cd/ef/g/
    #   12-986xy4  -> 12/-9/86/xy/4/
    #
    def self.id_to_ppath(*args)
      id, options = args
      prefix = (options || {})[:prefix]

      unless prefix.nil?
        prefix = prefix.to_s
        # Literal comparison plus slicing: the prefix is not treated as a
        # regular expression (a '.' matches only '.'), and the caller's
        # string, possibly frozen, is left untouched.
        id = id[prefix.length..-1] if id.start_with?(prefix)
      end

      string_to_dirpath(encode(id), MAX_SHORTY)
    end

    # Reconstructs the id for a given +ppath+ <em>(String)</em> and +options+.
    # Raises Orchard::InvalidPPathError when +ppath+ does not match
    # PPATH_REGEX.
    #
    #   ppath_to_id(ppath, options = {})
    #
    # ==== Options
    # * <tt>:prefix => Pairtree prefix</tt> - This will be prepended to the id
    #   decoded from the pairpath.
    #
    # ==== Examples
    #
    #   Pairtree.ppath_to_id('ab/cd/e')
    #   # => abcde
    #
    # or with the prefix option
    #
    #   Pairtree.ppath_to_id('ab/cd/e', :prefix => 'http://dom.org/')
    #   # => http://dom.org/abcde
    #
    def self.ppath_to_id(*args)
      ppath, options = args
      prefix = (options || {})[:prefix]

      match = ppath.match(PPATH_REGEX)
      # raise (not throw): throw is for catch/throw control flow and would
      # surface as an uncaught-throw error instead of InvalidPPathError.
      raise InvalidPPathError, "invalid pairpath: #{ppath.inspect}" if match.nil?

      id = decode(match[1].delete('/'))
      prefix.nil? ? id : prefix + id
    end

    # Internal - split a string into a directory path by shorty length.
    # Every run of up to +dir_length_max+ characters in +s+ is followed by a
    # '/', so a non-empty result always ends with a slash.
    #
    # NOTE: a bare +private+ has no effect on methods defined with
    # <tt>def self.</tt>, so this helper has always been publicly callable.
    # It stays callable for backward compatibility but is not part of the
    # supported API.
    def self.string_to_dirpath(s, dir_length_max)
      s.gsub(/.{1,#{dir_length_max}}/) { |segment| "#{segment}/" }
    end
  end
end
@@ -0,0 +1,3 @@
1
module Orchard
  # Version string of the Orchard library; frozen so it cannot be mutated
  # in place by callers.
  VERSION = '0.1'.freeze
end
metadata ADDED
@@ -0,0 +1,82 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: orchard
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ version: "0.1"
9
+ platform: ruby
10
+ authors:
11
+ - Stephanie Collett
12
+ autorequire:
13
+ bindir: bin
14
+ cert_chain: []
15
+
16
+ date: 2010-12-17 00:00:00 -08:00
17
+ default_executable:
18
+ dependencies:
19
+ - !ruby/object:Gem::Dependency
20
+ name: thoughtbot-shoulda
21
+ prerelease: false
22
+ requirement: &id001 !ruby/object:Gem::Requirement
23
+ none: false
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 0
29
+ version: "0"
30
+ type: :development
31
+ version_requirements: *id001
32
+ description: Orchard translates id strings to/from Pairtree paths for use with Pairtree file repositories.
33
+ email:
34
+ - stephanie.collett@ucop.edu
35
+ executables: []
36
+
37
+ extensions: []
38
+
39
+ extra_rdoc_files: []
40
+
41
+ files:
42
+ - lib/orchard/pairtree.rb
43
+ - lib/orchard/version.rb
44
+ - lib/orchard.rb
45
+ - LICENSE
46
+ - README.md
47
+ has_rdoc: true
48
+ homepage:
49
+ licenses: []
50
+
51
+ post_install_message:
52
+ rdoc_options: []
53
+
54
+ require_paths:
55
+ - lib
56
+ required_ruby_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ segments:
62
+ - 0
63
+ version: "0"
64
+ required_rubygems_version: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ segments:
70
+ - 1
71
+ - 3
72
+ - 6
73
+ version: 1.3.6
74
+ requirements: []
75
+
76
+ rubyforge_project:
77
+ rubygems_version: 1.3.7
78
+ signing_key:
79
+ specification_version: 3
80
+ summary: Pairtree implementation for Ruby
81
+ test_files: []
82
+