orchard 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,26 @@
1
+ Copyright (c) 2010, Regents of the University of California
2
+ All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are met:
6
+
7
+ * Redistributions of source code must retain the above copyright
8
+ notice, this list of conditions and the following disclaimer.
9
+ * Redistributions in binary form must reproduce the above copyright
10
+ notice, this list of conditions and the following disclaimer in the
11
+ documentation and/or other materials provided with the distribution.
12
+ * Neither the name of the University of California nor the names of
13
+ its contributors may be used to endorse or promote products derived
14
+ from this software without specific prior written permission.
15
+
16
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
17
+ IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
18
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19
+ PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OF THE UNIVERSITY
20
+ OF CALIFORNIA BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,21 @@
1
+ Orchard is a Ruby library for working with Pairtrees, a filesystem hierarchy
2
+ mapping identifiers to object directories.
3
+
4
+ More information can be found at:
5
+
6
+ Pairtrees for Object Storage
7
+ https://confluence.ucop.edu/display/Curation/PairTree
8
+
9
+ ==== Usage Examples
10
+
11
+ Pairtree.encode('ark:/13030/xt12t3')
12
+ # => ark+=13030=xt12t3
13
+
14
+ Pairtree.decode('ark+=13030=xt12t3')
15
+ # => ark:/13030/xt12t3
16
+
17
+ Pairtree.id_to_ppath('ark:/13030/xt12t3')
18
+ # => ar/k+/=1/30/30/=x/t1/2t/3/
19
+
20
+ Pairtree.ppath_to_id('ar/k+/=1/30/30/=x/t1/2t/3')
21
+ # => ark:/13030/xt12t3
@@ -0,0 +1,29 @@
1
+ require 'orchard/pairtree'
2
+ require 'orchard/version'
3
+
4
+ # Orchard is a Ruby library for working with Pairtrees, a filesystem hierarchy
5
+ # mapping identifiers to object directories.
6
+ #
7
+ # More information can be found at:
8
+ #
9
+ # Pairtrees for Object Storage
10
+ # https://confluence.ucop.edu/display/Curation/PairTree
11
+ #
12
+ # ==== Usage Examples
13
+ #
14
+ # Pairtree.encode('ark:/13030/xt12t3')
15
+ # # => ark+=13030=xt12t3
16
+ #
17
+ # Pairtree.decode('ark+=13030=xt12t3')
18
+ # # => ark:/13030/xt12t3
19
+ #
20
+ # Pairtree.id_to_ppath('ark:/13030/xt12t3')
21
+ # # => ar/k+/=1/30/30/=x/t1/2t/3/
22
+ #
23
+ # Pairtree.ppath_to_id('ar/k+/=1/30/30/=x/t1/2t/3')
24
+ # # => ark:/13030/xt12t3
25
+
26
module Orchard
  # Error type used by Pairtree.ppath_to_id to signal a pairpath that does
  # not match Pairtree::PPATH_REGEX.
  class InvalidPPathError < StandardError
  end
end
@@ -0,0 +1,175 @@
1
module Orchard
  # Provides a set of methods for working with Pairtree paths.
  class Pairtree
    # Maximum number of characters in a single pairpath directory name.
    MAX_SHORTY = 2
    # Characters hex-encoded (as ^hh) by the first pass of +encode+: the visible
    # ASCII characters " * + , < = > ? \ ^ | and everything outside \x21-\x7e.
    ENCODE_REGEX = /[\"*+,<=>?\\^|]|[^\x21-\x7e]/u
    # Tokenizes a cleaned id into either a ^hh escape (capture 1) or a single
    # literal character (capture 2).
    DECODE_REGEX = /\^(..)|(.)/u
    # Matches a pairpath, with an optional leading "pairtree_root/" and an
    # optional trailing slash. Capture 1 is the slash-separated run of shorties.
    PPATH_REGEX = /^(?:pairtree_root\/)?((?>[^:\/\.|.]{2}\/)*[^:\/\.|.]{1,2})(?:\/?$)/
    # Single-character substitutions applied by the second pass of +encode+.
    CHAR_ENCODE_CONV = {'/'=>'=',':'=>'+','.'=>','}.freeze
    # Inverse of CHAR_ENCODE_CONV, applied by the first pass of +decode+.
    CHAR_DECODE_CONV = {'='=>'/','+'=>':',','=>'.'}.freeze

    # Encodes a given +id+ <em>(String)</em> according to the "identifier string
    # cleaning" in the pairtree 0.1 specification. Returns a new String; +id+
    # is not modified.
    #
    #   encode(id)
    #
    # ==== Examples
    #
    #   Pairtree.encode('ark:/13030/xt12t3')
    #   # => ark+=13030=xt12t3
    #
    #   Pairtree.encode('http://n2t.info/urn:nbn:se:kb:repos-1')
    #   # => http+==n2t,info=urn+nbn+se+kb+repos-1
    #
    #   Pairtree.encode('what-the-*@?#!^!?')
    #   # => what-the-^2a@^3f#!^5e!^3f
    #
    # ==== Explanation (From Pairtree 0.1 Specification)
    #
    # Identifier string cleaning
    #
    # Prior to splitting into character pairs, identifier strings are cleaned in
    # two separate steps. One step would be simpler, but pairtree is designed so
    # that commonly used characters in reasonably opaque identifiers (e.g., not
    # containing natural language words, phrases, or hints) result in reasonably
    # short and familiar-looking paths. For completeness, the pairtree algorithm
    # specifies what to do with all possible UTF-8 characters, and relies for this
    # on a kind of URL hex-encoding. To avoid conflict with URLs, pairtree
    # hex-encoding is introduced with the '^' character instead of '%'.
    #
    # First, the identifier string is cleaned of characters that are expected to
    # occur rarely in object identifiers but that would cause certain known
    # problems for file systems. In this step, every UTF-8 octet outside the range
    # of visible ASCII (94 characters with hexadecimal codes 21-7e) [ASCII], as
    # well as the following visible ASCII characters,
    #
    #   " * + , < = > ? \ ^ |
    #
    # must be converted to their corresponding 3-character hexadecimal encoding,
    # ^hh, where ^ is a circumflex and hh is two hex digits. For example, ' '
    # (space) is converted to ^20 and '*' to ^2a. In the second step, the
    # following single-character to single-character conversions must be done:
    #
    #   / -> =
    #   : -> +
    #   . -> ,
    #
    # These are characters that occur quite commonly in opaque identifiers but
    # present special problems for filesystems. This step avoids requiring them
    # to be hex encoded (hence expanded to three characters), which keeps the
    # typical ppath reasonably short. Here is an example of an identifier string
    # after cleaning and after ppath mapping.
    #
    #   ark:/13030/xt12t3 -> ark+=13030=xt12t3 -> ar/k+/=1/30/30/=x/t1/2t/3/
    #
    def self.encode(id)
      # First pass: hex-encode every byte of each character matched by
      # ENCODE_REGEX (a multi-byte character becomes several ^hh escapes).
      hex_escaped = id.gsub(ENCODE_REGEX) do |char|
        char.bytes.map { |byte| format('^%02x', byte) }.join
      end

      # Second pass: single-character substitutions for '/', ':' and '.'.
      hex_escaped.gsub(/[\/:.]/, CHAR_ENCODE_CONV)
    end

    # Decodes a given +id+ <em>(String)</em> according to the pairtree 0.1
    # specification. This is the inverse of +encode+. Returns a new UTF-8 String.
    #
    #   decode(id)
    #
    # ==== Examples
    #
    #   Pairtree.decode('ark+=13030=xt12t3')
    #   # => ark:/13030/xt12t3
    #
    #   Pairtree.decode('http+==n2t,info=urn+nbn+se+kb+repos-1')
    #   # => http://n2t.info/urn:nbn:se:kb:repos-1
    #
    #   Pairtree.decode('what-the-^2a@^3f#!^5e!^3f')
    #   # => what-the-*@?#!^!?
    #
    def self.decode(id)
      # First pass (reverses the second pass of encode).
      unmapped = id.gsub(/[=+,]/, CHAR_DECODE_CONV)

      # Second pass (reverses the first pass of encode): collect the raw bytes.
      # A ^hh escape contributes one byte; a literal character contributes all
      # of its bytes, so a literal non-ASCII character is not truncated to a
      # single octet.
      octets = unmapped.scan(DECODE_REGEX).flat_map do |hex, char|
        hex.nil? ? char.bytes.to_a : [hex.hex]
      end
      octets.pack('C*').force_encoding('utf-8')
    end

    # Constructs the pairpath for a given +id+ <em>(String)</em> and +options+.
    # The +id+ argument is not modified. The returned pairpath always ends with
    # a trailing slash.
    #
    #   id_to_ppath(id, options = {})
    #
    # ==== Options
    # * <tt>:prefix => Pairtree prefix</tt> - This will remove the prefix from the id
    #   before creating a pairpath. The prefix is compared literally and is
    #   only removed when the id starts with it.
    #
    # ==== Examples
    #
    #   Pairtree.id_to_ppath('abcde')
    #   # => ab/cd/e/
    #
    # or with the prefix option
    #
    #   Pairtree.id_to_ppath('http://dom.org/abcde', :prefix => 'http://dom.org/')
    #   # => ab/cd/e/
    #
    # ==== Explanation (From Pairtree 0.1 Specification) The basic pairtree algorithm
    #
    # The pairtree algorithm maps an arbitrary UTF-8 [RFC3629] encoded identifier
    # string into a filesystem directory path based on successive pairs of
    # characters, and also defines the reverse mapping (from pathname to
    # identifier).
    #
    # In this document the word "directory" is used interchangeably with the word
    # "folder" and all examples conform to Unix-based filesystem conventions which
    # should translate easily to Windows conventions after substituting the path
    # separator ('\' instead of '/'). Pairtree places no limitations on file and
    # path lengths, so implementors thinking about maximal interoperation may
    # wish to consider the issues listed in the Interoperability section of
    # this document.
    #
    # The mapping from identifier string to path has two parts. First, the string
    # is cleaned by converting characters that would be illegal or especially
    # problematic in Unix or Windows filesystems. The cleaned string is then
    # split into pairs of characters, each of which becomes a directory name
    # in a filesystem path: successive pairs map to successive path components
    # until there are no characters left, with the last component being either
    # a 1- or 2-character directory name. The resulting path is known as
    # a pairpath, or ppath.
    #
    #   abcd      -> ab/cd/
    #   abcdefg   -> ab/cd/ef/g/
    #   12-986xy4 -> 12/-9/86/xy/4/
    #
    def self.id_to_ppath(*args)
      id, options = args
      options ||= {}

      unless options[:prefix].nil?
        prefix = options[:prefix].to_s
        # Slice a copy rather than using sub! with an interpolated regex: the
        # caller's string stays untouched and regex metacharacters in the
        # prefix ('.', '(', '+', ...) are treated literally.
        id = id[prefix.length..-1] if id.start_with?(prefix)
      end

      string_to_dirpath(encode(id), MAX_SHORTY)
    end

    # Reconstructs the id for a given +pairpath+ and +options+.
    #
    #   ppath_to_id(ppath, options = {})
    #   # ppath is a String
    #
    # A leading "pairtree_root/" and a trailing slash are accepted and ignored.
    # Raises InvalidPPathError when +ppath+ does not match PPATH_REGEX.
    #
    # ==== Options
    # * <tt>:prefix => Pairtree prefix</tt> - This will prepend the prefix to the
    #   id reconstructed from the pairpath.
    #
    # ==== Examples
    #
    #   Pairtree.ppath_to_id('ab/cd/e')
    #   # => abcde
    #
    # or with the prefix option
    #
    #   Pairtree.ppath_to_id('ab/cd/e', :prefix => 'http://dom.org/')
    #   # => http://dom.org/abcde
    #
    def self.ppath_to_id(*args)
      ppath, options = args
      options ||= {}

      match = PPATH_REGEX.match(ppath)
      # raise, not throw: throw is for catch/throw control flow and would
      # surface as an uncaught-throw error instead of InvalidPPathError.
      raise InvalidPPathError, "invalid pairpath: #{ppath.inspect}" if match.nil?

      id = decode(match[1].delete('/'))
      prefix = options[:prefix]
      prefix.nil? ? id : "#{prefix}#{id}"
    end

    # Internal - split a string into a directory path by shorty length. Every
    # chunk of up to +dir_length_max+ characters is followed by a '/'.
    #
    # Not part of the supported API. It stays publicly callable because the
    # bare +private+ keyword previously placed here never applied to methods
    # defined with <tt>def self.</tt>, so existing callers may depend on it.
    def self.string_to_dirpath(s, dir_length_max)
      s.gsub(/.{1,#{dir_length_max}}/) { |chunk| "#{chunk}/" }
    end
  end
end
@@ -0,0 +1,3 @@
1
module Orchard
  # Version string of the orchard gem. Frozen so the constant cannot be
  # mutated in place.
  VERSION = '0.1'.freeze
end
metadata ADDED
@@ -0,0 +1,82 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: orchard
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ version: "0.1"
9
+ platform: ruby
10
+ authors:
11
+ - Stephanie Collett
12
+ autorequire:
13
+ bindir: bin
14
+ cert_chain: []
15
+
16
+ date: 2010-12-17 00:00:00 -08:00
17
+ default_executable:
18
+ dependencies:
19
+ - !ruby/object:Gem::Dependency
20
+ name: thoughtbot-shoulda
21
+ prerelease: false
22
+ requirement: &id001 !ruby/object:Gem::Requirement
23
+ none: false
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 0
29
+ version: "0"
30
+ type: :development
31
+ version_requirements: *id001
32
+ description: Orchard translates id strings to/from Pairtree paths for use with Pairtree file repositories.
33
+ email:
34
+ - stephanie.collett@ucop.edu
35
+ executables: []
36
+
37
+ extensions: []
38
+
39
+ extra_rdoc_files: []
40
+
41
+ files:
42
+ - lib/orchard/pairtree.rb
43
+ - lib/orchard/version.rb
44
+ - lib/orchard.rb
45
+ - LICENSE
46
+ - README.md
47
+ has_rdoc: true
48
+ homepage:
49
+ licenses: []
50
+
51
+ post_install_message:
52
+ rdoc_options: []
53
+
54
+ require_paths:
55
+ - lib
56
+ required_ruby_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ segments:
62
+ - 0
63
+ version: "0"
64
+ required_rubygems_version: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ segments:
70
+ - 1
71
+ - 3
72
+ - 6
73
+ version: 1.3.6
74
+ requirements: []
75
+
76
+ rubyforge_project:
77
+ rubygems_version: 1.3.7
78
+ signing_key:
79
+ specification_version: 3
80
+ summary: Pairtree implementation for Ruby
81
+ test_files: []
82
+