orchard 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +26 -0
- data/README.md +21 -0
- data/lib/orchard.rb +29 -0
- data/lib/orchard/pairtree.rb +175 -0
- data/lib/orchard/version.rb +3 -0
- metadata +82 -0
data/LICENSE
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
Copyright (c) 2010, Regents of the University of California
|
2
|
+
All rights reserved.
|
3
|
+
|
4
|
+
Redistribution and use in source and binary forms, with or without
|
5
|
+
modification, are permitted provided that the following conditions are met:
|
6
|
+
|
7
|
+
* Redistributions of source code must retain the above copyright
|
8
|
+
notice, this list of conditions and the following disclaimer.
|
9
|
+
* Redistributions in binary form must reproduce the above copyright
|
10
|
+
notice, this list of conditions and the following disclaimer in the
|
11
|
+
documentation and/or other materials provided with the distribution.
|
12
|
+
* Neither the name of the University of California nor the names of
|
13
|
+
its contributors may be used to endorse or promote products derived
|
14
|
+
from this software without specific prior written permission.
|
15
|
+
|
16
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
17
|
+
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
18
|
+
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
19
|
+
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OF THE UNIVERSITY
|
20
|
+
OF CALIFORNIA BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
21
|
+
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
22
|
+
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
23
|
+
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
24
|
+
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
25
|
+
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
26
|
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/README.md
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
Orchard is a Ruby library for working with Pairtrees, a filesystem hierarchy
|
2
|
+
mapping identifiers to object directories.
|
3
|
+
|
4
|
+
More information can be found at:
|
5
|
+
|
6
|
+
Pairtrees for Object Storage
|
7
|
+
https://confluence.ucop.edu/display/Curation/PairTree
|
8
|
+
|
9
|
+
#### Usage Examples
|
10
|
+
|
11
|
+
Pairtree.encode('ark:/13030/xt12t3')
|
12
|
+
# => ark+=13030=xt12t3
|
13
|
+
|
14
|
+
Pairtree.decode('ark+=13030=xt12t3')
|
15
|
+
# => ark:/13030/xt12t3
|
16
|
+
|
17
|
+
Pairtree.id_to_ppath('ark:/13030/xt12t3')
|
18
|
+
# => ar/k+/=1/30/30/=x/t1/2t/3/
|
19
|
+
|
20
|
+
Pairtree.ppath_to_id('ar/k+/=1/30/30/=x/t1/2t/3')
|
21
|
+
# => ark:/13030/xt12t3
|
data/lib/orchard.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'orchard/pairtree'
|
2
|
+
require 'orchard/version'
|
3
|
+
|
4
|
+
# Orchard is a Ruby library for working with Pairtrees, a filesystem hierarchy
|
5
|
+
# mapping identifiers to object directories.
|
6
|
+
#
|
7
|
+
# More information can be found at:
|
8
|
+
#
|
9
|
+
# Pairtrees for Object Storage
|
10
|
+
# https://confluence.ucop.edu/display/Curation/PairTree
|
11
|
+
#
|
12
|
+
# ==== Usage Examples
|
13
|
+
#
|
14
|
+
# Pairtree.encode('ark:/13030/xt12t3')
|
15
|
+
# # => ark+=13030=xt12t3
|
16
|
+
#
|
17
|
+
# Pairtree.decode('ark+=13030=xt12t3')
|
18
|
+
# # => ark:/13030/xt12t3
|
19
|
+
#
|
20
|
+
# Pairtree.id_to_ppath('ark:/13030/xt12t3')
|
21
|
+
# # => ar/k+/=1/30/30/=x/t1/2t/3/
|
22
|
+
#
|
23
|
+
# Pairtree.ppath_to_id('ar/k+/=1/30/30/=x/t1/2t/3')
|
24
|
+
# # => ark:/13030/xt12t3
|
25
|
+
|
26
|
+
module Orchard
  # Signals that a pairpath string is not well-formed. It is referenced by
  # Orchard::Pairtree.ppath_to_id when its argument does not match the
  # expected pairpath layout.
  class InvalidPPathError < StandardError; end
end
|
@@ -0,0 +1,175 @@
|
|
1
|
+
module Orchard
  # Provides a set of methods for working with Pairtree paths.
  #
  # Every method is a class method; the class keeps no state.
  class Pairtree
    # Number of characters in each directory name ("shorty") of a pairpath.
    MAX_SHORTY = 2

    # Characters that the first encoding pass rewrites as ^hh: a fixed set of
    # visible ASCII characters plus everything outside visible ASCII (0x21-0x7e).
    ENCODE_REGEX = /[\"*+,<=>?\\^|]|[^\x21-\x7e]/u

    # Tokenizes an encoded id into ^hh escapes (capture 1) or single
    # characters (capture 2).
    DECODE_REGEX = /\^(..)|(.)/u

    # Matches a whole pairpath, optionally preceded by "pairtree_root/" and
    # optionally followed by a trailing "/". Capture 1 is the path without the
    # root and without the trailing slash. Anchored with \A and \Z (instead of
    # ^ and $) so that a valid line embedded in a multi-line string is not
    # accepted.
    PPATH_REGEX = /\A(?:pairtree_root\/)?((?>[^:\/.|]{2}\/)*[^:\/.|]{1,2})\/?\Z/

    # Single-character substitutions applied by the second encoding pass.
    CHAR_ENCODE_CONV = {'/'=>'=',':'=>'+','.'=>','}

    # Inverse of CHAR_ENCODE_CONV, applied by the first decoding pass.
    CHAR_DECODE_CONV = {'='=>'/','+'=>':',','=>'.'}

    # Encodes a given +id+ <em>(String)</em> according to the "identifier string
    # cleaning" in the pairtree 0.1 specification. Returns a new String; +id+
    # is not modified.
    #
    #   encode(id)
    #
    # ==== Examples
    #
    #   Pairtree.encode('ark:/13030/xt12t3')
    #   # => ark+=13030=xt12t3
    #
    #   Pairtree.encode('http://n2t.info/urn:nbn:se:kb:repos-1')
    #   # => http+==n2t,info=urn+nbn+se+kb+repos-1
    #
    #   Pairtree.encode('what-the-*@?#!^!?')
    #   # => what-the-^2a@^3f#!^5e!^3f
    #
    # ==== Explanation (From Pairtree 0.1 Specification)
    #
    # Identifier string cleaning
    #
    # Prior to splitting into character pairs, identifier strings are cleaned in
    # two separate steps. One step would be simpler, but pairtree is designed so
    # that commonly used characters in reasonably opaque identifiers (e.g., not
    # containing natural language words, phrases, or hints) result in reasonably
    # short and familiar-looking paths. For completeness, the pairtree algorithm
    # specifies what to do with all possible UTF-8 characters, and relies for this
    # on a kind of URL hex-encoding. To avoid conflict with URLs, pairtree
    # hex-encoding is introduced with the '^' character instead of '%'.
    #
    # First, the identifier string is cleaned of characters that are expected to
    # occur rarely in object identifiers but that would cause certain known
    # problems for file systems. In this step, every UTF-8 octet outside the range
    # of visible ASCII (94 characters with hexadecimal codes 21-7e) [ASCII], as
    # well as the following visible ASCII characters, must be converted to
    # their corresponding 3-character hexadecimal encoding, ^hh, where ^ is a
    # circumflex and hh is two hex digits. For example, ' ' (space) is converted
    # to ^20 and '*' to ^2a. In the second step, the following single-character to
    # single-character conversions must be done. These are characters that occur
    # quite commonly in opaque identifiers but present special problems for
    # filesystems. This step avoids requiring them to be hex encoded (hence
    # expanded to three characters), which keeps the typical ppath reasonably
    # short.
    #
    def self.encode(id)
      # First pass: hex-escape every byte of each problematic character.
      hex_escaped = id.gsub(ENCODE_REGEX) do |char|
        char.bytes.map { |byte| '^%02x' % byte }.join
      end

      # Second pass: one-to-one substitution of '/', ':' and '.'.
      hex_escaped.each_char.map { |char| CHAR_ENCODE_CONV.fetch(char, char) }.join
    end

    # Decodes a given +id+ <em>(String)</em> according to the pairtree 0.1
    # specification. This is the inverse of +encode+ and returns a new UTF-8
    # String.
    #
    #   decode(id)
    #
    # ==== Examples
    #
    #   Pairtree.decode('ark+=13030=xt12t3')
    #   # => ark:/13030/xt12t3
    #
    #   Pairtree.decode('http+==n2t,info=urn+nbn+se+kb+repos-1')
    #   # => http://n2t.info/urn:nbn:se:kb:repos-1
    #
    #   Pairtree.decode('what-the-^2a@^3f#!^5e!^3f')
    #   # => what-the-*@?#!^!?
    #
    def self.decode(id)
      # First pass (reverses the second encoding pass). It must run before the
      # hex pass so that an escaped '=', '+' or ',' (e.g. ^2b) is not converted.
      unsubstituted = id.each_char.map { |char| CHAR_DECODE_CONV.fetch(char, char) }.join

      # Second pass (reverses the first encoding pass): collect raw bytes, then
      # reinterpret them as UTF-8. A literal character contributes all of its
      # bytes, so an unescaped non-ASCII character is passed through intact
      # rather than being truncated to a single byte.
      octets = unsubstituted.scan(DECODE_REGEX).flat_map do |hex, char|
        hex ? [hex.hex] : char.bytes.to_a
      end
      octets.pack('C*').force_encoding('utf-8')
    end

    # Constructs the pairpath for a given +id+ <em>(String)</em> and +options+.
    # The +id+ argument is never modified.
    #
    #   id_to_ppath(id, options = {})
    #
    # ==== Options
    # * <tt>:prefix => Pairtree prefix</tt> - If +id+ starts with this literal
    #   string, it is removed from the id before creating a pairpath.
    #
    # ==== Examples
    #
    #   Pairtree.id_to_ppath('abcde')
    #   # => ab/cd/e/
    #
    # or with the prefix option
    #
    #   Pairtree.id_to_ppath('http://dom.org/abcde', :prefix => 'http://dom.org/')
    #   # => ab/cd/e/
    #
    # ==== Explanation (From Pairtree 0.1 Specification) The basic pairtree algorithm
    #
    # The pairtree algorithm maps an arbitrary UTF-8 [RFC3629] encoded identifier
    # string into a filesystem directory path based on successive pairs of
    # characters, and also defines the reverse mapping (from pathname to
    # identifier).
    #
    # In this document the word "directory" is used interchangeably with the word
    # "folder" and all examples conform to Unix-based filesystem conventions which
    # should translate easily to Windows conventions after substituting the path
    # separator ('\' instead of '/'). Pairtree places no limitations on file and
    # pathlengths, so implementors thinking about maximal interoperation may
    # wish to consider the issues listed in the Interoperability section of
    # this document.
    #
    # The mapping from identifier string to path has two parts. First, the string
    # is cleaned by converting characters that would be illegal or especially
    # problematic in Unix or Windows filesystems. The cleaned string is then
    # split into pairs of characters, each of which becomes a directory name
    # in a filesystem path: successive pairs map to successive path components
    # until there are no characters left, with the last component being either
    # a 1- or 2-character directory name. The resulting path is known as
    # a pairpath, or ppath.
    #
    #   abcd       -> ab/cd/
    #   abcdefg    -> ab/cd/ef/g/
    #   12-986xy4  -> 12/-9/86/xy/4/
    #
    def self.id_to_ppath(*args)
      id, options = args
      prefix = (options || {})[:prefix]

      # Strip the prefix as a literal string and without touching the caller's
      # object: sub! with an interpolated regex mutated +id+ and let characters
      # such as '.' in the prefix match anything.
      id = id[prefix.length..-1] if prefix && id.start_with?(prefix)

      string_to_dirpath(encode(id), MAX_SHORTY)
    end

    # Reconstructs the id for a given +ppath+ <em>(String)</em> and +options+.
    # The pairpath may start with "pairtree_root/" and may end with a "/".
    #
    #   ppath_to_id(ppath, options = {})
    #
    # Raises Orchard::InvalidPPathError when +ppath+ is not a well-formed
    # pairpath.
    #
    # ==== Options
    # * <tt>:prefix => Pairtree prefix</tt> - This will be prepended to the
    #   decoded id.
    #
    # ==== Examples
    #
    #   Pairtree.ppath_to_id('ab/cd/e')
    #   # => abcde
    #
    # or with the prefix option
    #
    #   Pairtree.ppath_to_id('ab/cd/e', :prefix => 'http://dom.org/')
    #   # => http://dom.org/abcde
    #
    def self.ppath_to_id(*args)
      ppath, options = args
      prefix = (options || {})[:prefix]

      match = ppath.match(PPATH_REGEX)
      # raise (not throw): throw is catch/throw control flow and would surface
      # as an uncaught-throw error that callers cannot rescue as
      # InvalidPPathError.
      raise InvalidPPathError, "invalid pairpath: #{ppath.inspect}" if match.nil?

      id = decode(match[1].delete('/'))
      prefix.nil? ? id : prefix + id
    end

    # Internal - split a string into a directory path by shorty length. Every
    # chunk of up to +dir_length_max+ characters, including the last one, is
    # followed by a '/'.
    #
    # A bare +private+ keyword has no effect on methods defined with
    # <tt>def self.</tt>, so this method has always been publicly callable; it
    # is left that way for backward compatibility.
    def self.string_to_dirpath(s, dir_length_max)
      s.gsub(/.{1,#{dir_length_max}}/) { |chunk| chunk + '/' }
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: orchard
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 1
|
8
|
+
version: "0.1"
|
9
|
+
platform: ruby
|
10
|
+
authors:
|
11
|
+
- Stephanie Collett
|
12
|
+
autorequire:
|
13
|
+
bindir: bin
|
14
|
+
cert_chain: []
|
15
|
+
|
16
|
+
date: 2010-12-17 00:00:00 -08:00
|
17
|
+
default_executable:
|
18
|
+
dependencies:
|
19
|
+
- !ruby/object:Gem::Dependency
|
20
|
+
name: thoughtbot-shoulda
|
21
|
+
prerelease: false
|
22
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
23
|
+
none: false
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 0
|
29
|
+
version: "0"
|
30
|
+
type: :development
|
31
|
+
version_requirements: *id001
|
32
|
+
description: Orchard translates id strings to/from Pairtree paths for use with Pairtree file repositories.
|
33
|
+
email:
|
34
|
+
- stephanie.collett@ucop.edu
|
35
|
+
executables: []
|
36
|
+
|
37
|
+
extensions: []
|
38
|
+
|
39
|
+
extra_rdoc_files: []
|
40
|
+
|
41
|
+
files:
|
42
|
+
- lib/orchard/pairtree.rb
|
43
|
+
- lib/orchard/version.rb
|
44
|
+
- lib/orchard.rb
|
45
|
+
- LICENSE
|
46
|
+
- README.md
|
47
|
+
has_rdoc: true
|
48
|
+
homepage:
|
49
|
+
licenses: []
|
50
|
+
|
51
|
+
post_install_message:
|
52
|
+
rdoc_options: []
|
53
|
+
|
54
|
+
require_paths:
|
55
|
+
- lib
|
56
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
segments:
|
62
|
+
- 0
|
63
|
+
version: "0"
|
64
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
segments:
|
70
|
+
- 1
|
71
|
+
- 3
|
72
|
+
- 6
|
73
|
+
version: 1.3.6
|
74
|
+
requirements: []
|
75
|
+
|
76
|
+
rubyforge_project:
|
77
|
+
rubygems_version: 1.3.7
|
78
|
+
signing_key:
|
79
|
+
specification_version: 3
|
80
|
+
summary: Pairtree implementation for Ruby
|
81
|
+
test_files: []
|
82
|
+
|