orchard 0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +26 -0
- data/README.md +21 -0
- data/lib/orchard.rb +29 -0
- data/lib/orchard/pairtree.rb +175 -0
- data/lib/orchard/version.rb +3 -0
- metadata +82 -0
data/LICENSE
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
Copyright (c) 2010, Regents of the University of California
|
2
|
+
All rights reserved.
|
3
|
+
|
4
|
+
Redistribution and use in source and binary forms, with or without
|
5
|
+
modification, are permitted provided that the following conditions are met:
|
6
|
+
|
7
|
+
* Redistributions of source code must retain the above copyright
|
8
|
+
notice, this list of conditions and the following disclaimer.
|
9
|
+
* Redistributions in binary form must reproduce the above copyright
|
10
|
+
notice, this list of conditions and the following disclaimer in the
|
11
|
+
documentation and/or other materials provided with the distribution.
|
12
|
+
* Neither the name of the University of California nor the names of
|
13
|
+
its contributors may be used to endorse or promote products derived
|
14
|
+
from this software without specific prior written permission.
|
15
|
+
|
16
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
17
|
+
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
18
|
+
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
19
|
+
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OF THE UNIVERSITY
|
20
|
+
OF CALIFORNIA BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
21
|
+
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
22
|
+
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
23
|
+
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
24
|
+
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
25
|
+
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
26
|
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/README.md
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
Orchard is a Ruby library for working with Pairtrees, a filesystem hierarchy
|
2
|
+
mapping identifiers to object directories.
|
3
|
+
|
4
|
+
More information can be found at:
|
5
|
+
|
6
|
+
Pairtrees for Object Storage
|
7
|
+
https://confluence.ucop.edu/display/Curation/PairTree
|
8
|
+
|
9
|
+
#### Usage Examples
|
10
|
+
|
11
|
+
Pairtree.encode('ark:/13030/xt12t3')
|
12
|
+
# => ark+=13030=xt12t3
|
13
|
+
|
14
|
+
Pairtree.decode('ark+=13030=xt12t3')
|
15
|
+
# => ark:/13030/xt12t3
|
16
|
+
|
17
|
+
Pairtree.id_to_ppath('ark:/13030/xt12t3')
|
18
|
+
# => ar/k+/=1/30/30/=x/t1/2t/3/
|
19
|
+
|
20
|
+
Pairtree.ppath_to_id('ar/k+/=1/30/30/=x/t1/2t/3')
|
21
|
+
# => ark:/13030/xt12t3
|
data/lib/orchard.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'orchard/pairtree'
|
2
|
+
require 'orchard/version'
|
3
|
+
|
4
|
+
# Orchard is a Ruby library for working with Pairtrees, a filesystem hierarchy
|
5
|
+
# mapping identifiers to object directories.
|
6
|
+
#
|
7
|
+
# More information can be found at:
|
8
|
+
#
|
9
|
+
# Pairtrees for Object Storage
|
10
|
+
# https://confluence.ucop.edu/display/Curation/PairTree
|
11
|
+
#
|
12
|
+
# ==== Usage Examples
|
13
|
+
#
|
14
|
+
# Pairtree.encode('ark:/13030/xt12t3')
|
15
|
+
# # => ark+=13030=xt12t3
|
16
|
+
#
|
17
|
+
# Pairtree.decode('ark+=13030=xt12t3')
|
18
|
+
# # => ark:/13030/xt12t3
|
19
|
+
#
|
20
|
+
# Pairtree.id_to_ppath('ark:/13030/xt12t3')
|
21
|
+
# # => ar/k+/=1/30/30/=x/t1/2t/3/
|
22
|
+
#
|
23
|
+
# Pairtree.ppath_to_id('ar/k+/=1/30/30/=x/t1/2t/3')
|
24
|
+
# # => ark:/13030/xt12t3
|
25
|
+
|
26
|
+
module Orchard
  # Error type signalling that a string is not a valid pairpath
  # (referenced by Pairtree.ppath_to_id when its input fails to parse).
  InvalidPPathError = Class.new(StandardError)
end
|
@@ -0,0 +1,175 @@
|
|
1
|
+
module Orchard
  # Provides a set of methods for working with Pairtree paths.
  class Pairtree
    # Number of characters in each directory name ("shorty") of a pairpath.
    MAX_SHORTY = 2

    # Characters hex-encoded (^hh) by the first cleaning pass: the visible
    # ASCII characters " * + , < = > ? \ ^ | plus every character outside
    # the visible ASCII range (hex 21-7e).
    ENCODE_REGEX = /[\"*+,<=>?\\^|]|[^\x21-\x7e]/u

    # Matches either a ^hh hex escape (two characters captured in group 1)
    # or any other single character (captured in group 2).
    DECODE_REGEX = /\^(..)|(.)/u

    # Matches a pairpath: an optional "pairtree_root/" prefix, any number of
    # two-character components each followed by '/', a final one- or
    # two-character component, and an optional trailing '/'. Group 1 captures
    # the path without the prefix and without the trailing '/'.
    PPATH_REGEX = /^(?:pairtree_root\/)?((?>[^:\/\.|.]{2}\/)*[^:\/\.|.]{1,2})(?:\/?$)/

    # Single-character substitutions applied by the second cleaning pass.
    CHAR_ENCODE_CONV = {'/'=>'=',':'=>'+','.'=>','}.freeze

    # Inverse of CHAR_ENCODE_CONV, applied first when decoding.
    CHAR_DECODE_CONV = {'='=>'/','+'=>':',','=>'.'}.freeze

    # Encodes a given +id+ <em>(String)</em> according to the "identifier string
    # cleaning" in the pairtree 0.1 specification. Returns a new String; +id+
    # is not modified.
    #
    #   encode(id)
    #
    # ==== Examples
    #
    #   Pairtree.encode('ark:/13030/xt12t3')
    #   # => ark+=13030=xt12t3
    #
    #   Pairtree.encode('http://n2t.info/urn:nbn:se:kb:repos-1')
    #   # => http+==n2t,info=urn+nbn+se+kb+repos-1
    #
    #   Pairtree.encode('what-the-*@?#!^!?')
    #   # => what-the-^2a@^3f#!^5e!^3f
    #
    # ==== Explanation (From Pairtree 0.1 Specification)
    #
    # Identifier string cleaning
    #
    # Prior to splitting into character pairs, identifier strings are cleaned in
    # two separate steps. One step would be simpler, but pairtree is designed so
    # that commonly used characters in reasonably opaque identifiers (e.g., not
    # containing natural language words, phrases, or hints) result in reasonably
    # short and familiar-looking paths. For completeness, the pairtree algorithm
    # specifies what to do with all possible UTF-8 characters, and relies for this
    # on a kind of URL hex-encoding. To avoid conflict with URLs, pairtree
    # hex-encoding is introduced with the '^' character instead of '%'.
    #
    # First, the identifier string is cleaned of characters that are expected to
    # occur rarely in object identifiers but that would cause certain known
    # problems for file systems. In this step, every UTF-8 octet outside the range
    # of visible ASCII (94 characters with hexadecimal codes 21-7e) [ASCII], as
    # well as the following visible ASCII characters,
    #
    #   " * + , < = > ? \ ^ |
    #
    # must be converted to their corresponding 3-character hexadecimal encoding,
    # ^hh, where ^ is a circumflex and hh is two hex digits. For example, ' '
    # (space) is converted to ^20 and '*' to ^2a.
    #
    # In the second step, the following single-character to single-character
    # conversions must be done:
    #
    #   / -> =
    #   : -> +
    #   . -> ,
    #
    # These are characters that occur quite commonly in opaque identifiers but
    # present special problems for filesystems. This step avoids requiring them
    # to be hex encoded (hence expanded to three characters), which keeps the
    # typical ppath reasonably short. The examples above show identifier strings
    # after cleaning; see id_to_ppath for the ppath mapping.
    #
    def self.encode(id)
      # First pass: hex-encode every byte of each character ENCODE_REGEX matches.
      hex_encoded = id.gsub(ENCODE_REGEX) do |char|
        char.bytes.map { |byte| '^%02x' % byte }.join
      end

      # Second pass: one-for-one substitution of '/', ':' and '.'.
      hex_encoded.gsub(%r{[/:.]}, CHAR_ENCODE_CONV)
    end

    # Decodes a given +id+ <em>(String)</em> according to the pairtree 0.1
    # specification. This is the inverse of encode and returns a new UTF-8
    # String.
    #
    #   decode(id)
    #
    # ==== Examples
    #
    #   Pairtree.decode('ark+=13030=xt12t3')
    #   # => ark:/13030/xt12t3
    #
    #   Pairtree.decode('http+==n2t,info=urn+nbn+se+kb+repos-1')
    #   # => http://n2t.info/urn:nbn:se:kb:repos-1
    #
    #   Pairtree.decode('what-the-^2a@^3f#!^5e!^3f')
    #   # => what-the-*@?#!^!?
    #
    def self.decode(id)
      # First pass (reverses the second pass of encode).
      swapped = id.gsub(/[=+,]/, CHAR_DECODE_CONV)

      # Second pass (reverses the first pass of encode): turn ^hh escapes back
      # into single bytes. Unescaped characters contribute all of their bytes,
      # so a literal multi-byte character is not truncated to one byte.
      bytes = swapped.scan(DECODE_REGEX).flat_map do |hex, char|
        hex ? [hex.hex] : char.bytes.to_a
      end
      bytes.pack('C*').force_encoding('utf-8')
    end

    # Constructs the pairpath for a given +id+ <em>(String)</em> and +options+.
    # The +id+ passed in is never modified.
    #
    #   id_to_ppath(id, options = {})
    #
    # ==== Options
    # * <tt>:prefix => Pairtree prefix</tt> - This will remove the prefix from the
    #   start of the id before creating a pairpath. The prefix is matched
    #   literally (it is not interpreted as a regular expression).
    #
    # ==== Examples
    #
    #   Pairtree.id_to_ppath('abcde')
    #   # => ab/cd/e/
    #
    # or with the prefix option
    #
    #   Pairtree.id_to_ppath('http://dom.org/abcde', :prefix => 'http://dom.org/')
    #   # => ab/cd/e/
    #
    # ==== Explanation (From Pairtree 0.1 Specification) The basic pairtree algorithm
    #
    # The pairtree algorithm maps an arbitrary UTF-8 [RFC3629] encoded identifier
    # string into a filesystem directory path based on successive pairs of
    # characters, and also defines the reverse mapping (from pathname to
    # identifier).
    #
    # In this document the word "directory" is used interchangeably with the word
    # "folder" and all examples conform to Unix-based filesystem conventions which
    # should translate easily to Windows conventions after substituting the path
    # separator ('\' instead of '/'). Pairtree places no limitations on file and
    # path lengths, so implementors thinking about maximal interoperation may
    # wish to consider the issues listed in the Interoperability section of
    # this document.
    #
    # The mapping from identifier string to path has two parts. First, the string
    # is cleaned by converting characters that would be illegal or especially
    # problematic in Unix or Windows filesystems. The cleaned string is then
    # split into pairs of characters, each of which becomes a directory name
    # in a filesystem path: successive pairs map to successive path components
    # until there are no characters left, with the last component being either
    # a 1- or 2-character directory name. The resulting path is known as
    # a pairpath, or ppath.
    #
    #   abcd -> ab/cd/
    #   abcdefg -> ab/cd/ef/g/
    #   12-986xy4 -> 12/-9/86/xy/4/
    #
    def self.id_to_ppath(*args)
      id, options = args
      options ||= {}
      prefix = options[:prefix]

      # Non-destructive sub with an escaped, string-anchored pattern: the
      # caller's string is left untouched (and may be frozen), and characters
      # such as '.' in the prefix only match themselves.
      id = id.sub(/\A#{Regexp.escape(prefix.to_s)}/, '') unless prefix.nil?

      string_to_dirpath(encode(id), MAX_SHORTY)
    end

    # Reconstructs the id for a given +pairpath+ and +options+.
    #
    #   ppath_to_id(ppath, options = {})
    #   # ppath is a String
    #
    # Raises InvalidPPathError when +ppath+ does not match PPATH_REGEX.
    #
    # ==== Options
    # * <tt>:prefix => Pairtree prefix</tt> - This will be prepended to the id
    #   reconstructed from the pairpath.
    #
    # ==== Examples
    #
    #   Pairtree.ppath_to_id('ab/cd/e')
    #   # => abcde
    #
    # or with the prefix option
    #
    #   Pairtree.ppath_to_id('ab/cd/e', :prefix => 'http://dom.org/')
    #   # => http://dom.org/abcde
    #
    def self.ppath_to_id(*args)
      ppath, options = args
      options ||= {}

      match = ppath.match(PPATH_REGEX)
      # raise (not throw): throw is non-local control flow and would surface
      # as an uncaught-throw error instead of InvalidPPathError.
      raise InvalidPPathError, "invalid pairpath: #{ppath.inspect}" if match.nil?

      id = decode(match[1].delete('/'))
      prefix = options[:prefix]
      prefix.nil? ? id : prefix + id
    end

    # Internal - split a string into a directory path by shorty length. Every
    # chunk of up to +dir_length_max+ characters is followed by a '/', so the
    # result of a non-empty string always ends with '/'.
    #
    # NOTE: a bare +private+ keyword has no effect on methods defined with
    # <tt>def self.</tt>, so this helper has always been publicly callable; it
    # is intentionally left public to avoid breaking existing callers.
    def self.string_to_dirpath(s, dir_length_max)
      s.gsub(/.{1,#{dir_length_max}}/) { |chunk| "#{chunk}/" }
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: orchard
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 1
|
8
|
+
version: "0.1"
|
9
|
+
platform: ruby
|
10
|
+
authors:
|
11
|
+
- Stephanie Collett
|
12
|
+
autorequire:
|
13
|
+
bindir: bin
|
14
|
+
cert_chain: []
|
15
|
+
|
16
|
+
date: 2010-12-17 00:00:00 -08:00
|
17
|
+
default_executable:
|
18
|
+
dependencies:
|
19
|
+
- !ruby/object:Gem::Dependency
|
20
|
+
name: thoughtbot-shoulda
|
21
|
+
prerelease: false
|
22
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
23
|
+
none: false
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 0
|
29
|
+
version: "0"
|
30
|
+
type: :development
|
31
|
+
version_requirements: *id001
|
32
|
+
description: Orchard translates id strings to/from Pairtree paths for use with Pairtree file repositories.
|
33
|
+
email:
|
34
|
+
- stephanie.collett@ucop.edu
|
35
|
+
executables: []
|
36
|
+
|
37
|
+
extensions: []
|
38
|
+
|
39
|
+
extra_rdoc_files: []
|
40
|
+
|
41
|
+
files:
|
42
|
+
- lib/orchard/pairtree.rb
|
43
|
+
- lib/orchard/version.rb
|
44
|
+
- lib/orchard.rb
|
45
|
+
- LICENSE
|
46
|
+
- README.md
|
47
|
+
has_rdoc: true
|
48
|
+
homepage:
|
49
|
+
licenses: []
|
50
|
+
|
51
|
+
post_install_message:
|
52
|
+
rdoc_options: []
|
53
|
+
|
54
|
+
require_paths:
|
55
|
+
- lib
|
56
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
segments:
|
62
|
+
- 0
|
63
|
+
version: "0"
|
64
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
segments:
|
70
|
+
- 1
|
71
|
+
- 3
|
72
|
+
- 6
|
73
|
+
version: 1.3.6
|
74
|
+
requirements: []
|
75
|
+
|
76
|
+
rubyforge_project:
|
77
|
+
rubygems_version: 1.3.7
|
78
|
+
signing_key:
|
79
|
+
specification_version: 3
|
80
|
+
summary: Pairtree implementation for Ruby
|
81
|
+
test_files: []
|
82
|
+
|