gdiff 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +7 -0
- data/COPYING.suffix_array +278 -0
- data/LICENSE.suffix_array +17 -0
- data/README +40 -0
- data/README.suffix_array +274 -0
- data/bin/gdiff +25 -0
- data/bin/gpatch +25 -0
- data/doc/classes/Diff.html +117 -0
- data/doc/classes/Diff/GDiff.html +120 -0
- data/doc/classes/Diff/GDiff/EGdiffError.html +111 -0
- data/doc/classes/Diff/GDiff/ENoGdiffStream.html +113 -0
- data/doc/classes/Diff/GDiff/EPrematureEndOfStream.html +113 -0
- data/doc/classes/Diff/GDiff/Operations.html +156 -0
- data/doc/classes/Diff/GDiff/Operations/Copy.html +246 -0
- data/doc/classes/Diff/GDiff/Operations/Copy.src/M000014.html +19 -0
- data/doc/classes/Diff/GDiff/Operations/Copy.src/M000015.html +39 -0
- data/doc/classes/Diff/GDiff/Operations/Copy.src/M000016.html +25 -0
- data/doc/classes/Diff/GDiff/Operations/Copy.src/M000017.html +18 -0
- data/doc/classes/Diff/GDiff/Operations/Data.html +246 -0
- data/doc/classes/Diff/GDiff/Operations/Data.src/M000009.html +18 -0
- data/doc/classes/Diff/GDiff/Operations/Data.src/M000010.html +18 -0
- data/doc/classes/Diff/GDiff/Operations/Data.src/M000011.html +35 -0
- data/doc/classes/Diff/GDiff/Operations/Data.src/M000012.html +29 -0
- data/doc/classes/Diff/GDiff/Operations/Data.src/M000013.html +19 -0
- data/doc/classes/SAError.html +111 -0
- data/doc/classes/SuffixArray.html +342 -0
- data/doc/classes/SuffixArray.src/M000001.html +97 -0
- data/doc/classes/SuffixArray.src/M000002.html +73 -0
- data/doc/classes/SuffixArray.src/M000003.html +102 -0
- data/doc/classes/SuffixArray.src/M000004.html +47 -0
- data/doc/classes/SuffixArray.src/M000005.html +44 -0
- data/doc/classes/SuffixArray.src/M000006.html +33 -0
- data/doc/classes/SuffixArray.src/M000007.html +24 -0
- data/doc/classes/SuffixArray.src/M000008.html +46 -0
- data/doc/created.rid +1 -0
- data/doc/files/ext/gdiff/suffix_array/extconf_rb.html +108 -0
- data/doc/files/ext/gdiff/suffix_array/lcp_c.html +101 -0
- data/doc/files/ext/gdiff/suffix_array/sarray_c.html +101 -0
- data/doc/files/ext/gdiff/suffix_array/suffix_array_c.html +101 -0
- data/doc/files/lib/gdiff_rb.html +108 -0
- data/doc/fr_class_index.html +36 -0
- data/doc/fr_file_index.html +31 -0
- data/doc/fr_method_index.html +43 -0
- data/doc/index.html +24 -0
- data/doc/rdoc-style.css +208 -0
- data/ext/gdiff/COPYING +278 -0
- data/ext/gdiff/LICENSE +17 -0
- data/ext/gdiff/README +274 -0
- data/ext/gdiff/extconf.rb +3 -0
- data/ext/gdiff/lcp.c +97 -0
- data/ext/gdiff/sarray.3 +145 -0
- data/ext/gdiff/sarray.c +372 -0
- data/ext/gdiff/sarray.h +13 -0
- data/ext/gdiff/suffix_array.c +510 -0
- data/lib/gdiff.rb +255 -0
- data/setup.rb +1551 -0
- data/test/tc_gdiff.rb +66 -0
- metadata +119 -0
data/ext/gdiff/LICENSE
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
FastCST -- A tool for revision control.
|
2
|
+
|
3
|
+
Copyright (C) 2004-2005 Zed A. Shaw
|
4
|
+
|
5
|
+
This program is free software; you can redistribute it and/or modify
|
6
|
+
it under the terms of the GNU General Public License as published by
|
7
|
+
the Free Software Foundation; either version 2 of the License, or
|
8
|
+
(at your option) any later version.
|
9
|
+
|
10
|
+
This program is distributed in the hope that it will be useful,
|
11
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
GNU General Public License for more details.
|
14
|
+
|
15
|
+
You should have received a copy of the GNU General Public License
|
16
|
+
along with this program; if not, write to the Free Software
|
17
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
data/ext/gdiff/README
ADDED
@@ -0,0 +1,274 @@
|
|
1
|
+
== Fast Change Set Tool
|
2
|
+
|
3
|
+
FastCST is a change set tool I created to experiment with
|
4
|
+
ideas in change management, distributed development, and alternatives to existing
|
5
|
+
tools.
|
6
|
+
|
7
|
+
== Current Features
|
8
|
+
|
9
|
+
FastCST is an evolving tool that you probably shouldn't use quite yet for
|
10
|
+
anything serious. It does support a wide range of features, but since I
|
11
|
+
started it in March 2005 it's not fully tested yet.
|
12
|
+
|
13
|
+
The current laundry list of features are:
|
14
|
+
|
15
|
+
* Complete changesets that are very simple by design and encode complete
|
16
|
+
cohesive changes.
|
17
|
+
* Extensible meta-data for changesets.
|
18
|
+
* A simple to use repository that should be easy for any other languages to read.
|
19
|
+
* All revisions are uniquely identified by a UUID to avoid clashing.
|
20
|
+
* Ability to undo any applied revision.
|
21
|
+
* You can publish your repository to an FTP site efficiently.
|
22
|
+
* Directly share your repository for quick small-scale sharing without FTP.
|
23
|
+
* Anyone can download the latest revisions from an FTP or HTTP site.
|
24
|
+
* Send/receive changesets through e-mail including human readable meta-data.
|
25
|
+
* Transaction oriented operation.
|
26
|
+
* Remarkably fast operation considering there's been no optimization done and it's written in Ruby.
|
27
|
+
* A reasonable shell prompt so you don't have to type 'fcst' in front of everything.
|
28
|
+
* A working plugins feature letting you implement your own commands, with an example
|
29
|
+
command that creates release archives.
|
30
|
+
* A working "triggers" feature so you can easily wrap commands with your own logic.
|
31
|
+
* Ability to attach external files to distribute with your changesets (not tested much).
|
32
|
+
* Disposition references so you can reference bug trackers, web sites, mailing list posts, etc.
|
33
|
+
* 95% pure Ruby.
|
34
|
+
* A unique delta algorithm that uses suffix arrays and produces smaller deltas than most other
|
35
|
+
delta algorithms without sacrificing speed much.
|
36
|
+
* A painstakingly consistent command interface with extensive help for each option.
|
37
|
+
* The beginning of a merge feature that lets you merge without needing to reference history.
|
38
|
+
* An "index" command that will use suffix arrays to index your files and let you find where
|
39
|
+
text is mentioned. This will turn into an optimization and an advanced search feature.
|
40
|
+
* Simple and consistent aborting and undo so you can trash your source in the comfort of your
|
41
|
+
own stupidity and still recover.
|
42
|
+
* Completely redesigned YAML format that avoids using !ruby object loading and uses only
|
43
|
+
native types found in all languages. This avoids some security concerns, but YAML needs
|
44
|
+
a means of telling it to NOT load arbitrary objects. Still searching, but I may have to
|
45
|
+
dump YAML if I can't fix it.
|
46
|
+
* Makes a reasonable attempt to deal with symlinks and directories. It treats directories
|
47
|
+
as out of band data and simply fixes them up at the end of the application process. This
|
48
|
+
is very handy since you can just glance at the meta-data to find out if someone is deleting
|
49
|
+
your favorite source tree.
|
50
|
+
|
51
|
+
|
52
|
+
== Missing Features
|
53
|
+
|
54
|
+
There's still quite a lot of stuff missing that I want to put into FastCST, but these are
|
55
|
+
the big ones that it needs before it's useful:
|
56
|
+
|
57
|
+
* Merging is implemented, but conflict resolution is not yet. It currently will not let
|
58
|
+
you resolve conflicts and refuses to do the merge.
|
59
|
+
* Recovering individual files from the repository. This is needed to get conflicts working.
|
60
|
+
* Better security protections like not using YAML for the journal file or somehow restricting
|
61
|
+
what objects can be loaded (that thing is like giving a toddler a shot-gun).
|
62
|
+
* Digitally signed and verified revisions so people can confirm who sent the revision.
|
63
|
+
* Improved safety checks. It's pretty good now, but things like applying a delta is still
|
64
|
+
not as safe as I'd like.
|
65
|
+
|
66
|
+
Some of the things I'd like to implement are:
|
67
|
+
|
68
|
+
* Connecting with FAM or Dazuko to let FastCST track your actions and warn about bad stuff.
|
69
|
+
* Flexible command aliasing that lets you create alternative commands.
|
70
|
+
* Using mDNS to let people quickly and painlessly find repositories and other developers.
|
71
|
+
* Hooking into DamageControl and the RSCM library.
|
72
|
+
|
73
|
+
If you have any suggestions for these or for other features you want then go ahead and
|
74
|
+
contact me at zedshaw AT zedshaw DOT com.
|
75
|
+
|
76
|
+
== Security Warnings
|
77
|
+
|
78
|
+
DO NOT ACCEPT CHANGESETS FROM PEOPLE YOU DO NOT KNOW. Since there's no digital signature
|
79
|
+
capabilities this means nobody except yourself. The reason why is because there are many
|
80
|
+
places where YAML is used, but YAML doesn't provide a mechanism for restricting what can
|
81
|
+
be loaded when unmarshalling Ruby structures. It would be no problem for someone to create
|
82
|
+
a meta-data or journal file with a bit of code to destroy your world.
|
83
|
+
|
84
|
+
Another really big caution is please don't use this thing on any source you feel is really
|
85
|
+
important. It is still mostly ALPHA stage so there's a very good chance that you'll destroy
|
86
|
+
your world if you use it. Especially important is that the delta algorithm is fairly new
|
87
|
+
and the suffix array library needs to be audited more.
|
88
|
+
|
89
|
+
Finally, it uses POP3 and FTP with bare passwords. This is mostly because I'm not sure how
|
90
|
+
to get the Net::POP and Net::FTP stuff to use APOP and/or SSL. For the most part I'm just
|
91
|
+
tunneling the protocols through SSH to my servers.
|
92
|
+
|
93
|
+
|
94
|
+
== License
|
95
|
+
|
96
|
+
Copyright (C) 2004-2005 Zed A. Shaw
|
97
|
+
|
98
|
+
This program is free software; you can redistribute it and/or modify
|
99
|
+
it under the terms of the GNU General Public License as published by
|
100
|
+
the Free Software Foundation; either version 2 of the License, or
|
101
|
+
(at your option) any later version.
|
102
|
+
|
103
|
+
This program is distributed in the hope that it will be useful,
|
104
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
105
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
106
|
+
GNU General Public License for more details.
|
107
|
+
|
108
|
+
You should have received a copy of the GNU General Public License
|
109
|
+
along with this program; if not, write to the Free Software
|
110
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
111
|
+
|
112
|
+
|
113
|
+
=== Building
|
114
|
+
|
115
|
+
The script that does everything is called "fcst", and the Rakefile is setup to build
|
116
|
+
a completely stand-alone version. This is the version that you can run with only a
|
117
|
+
basic ruby 1.8 install rather than installing all the files in your Ruby directories.
|
118
|
+
|
119
|
+
Building the fcst script this way requires simply typing "rake" in the source
|
120
|
+
directory. When the build is done you'll have a single ruby script in build/fcst.
|
121
|
+
|
122
|
+
|
123
|
+
==== Debian Notes
|
124
|
+
|
125
|
+
The build has been tested on ArchLinux and Debian, but to get it to build or
|
126
|
+
work under Debian you'll need to do some magic apt-get work:
|
127
|
+
|
128
|
+
1. Remove anything remotely related to ruby. The package layout changed recently
|
129
|
+
so this is necessary to get it to reinstall correctly.
|
130
|
+
2. apt-get install ruby (not ruby1.8). If you're on the right version of Debian
|
131
|
+
(of the 100) you should get the 1.8 stuff with all the goodies.
|
132
|
+
3. Finally make sure you have these packages as well:
|
133
|
+
* rake
|
134
|
+
* ruby1.8-dev (yes, specify the 1.8 this time)
|
135
|
+
* libtest-unit-ruby (no 1.8 this time)
|
136
|
+
|
137
|
+
Once you do this you can then use the "rake" command to build everything and get
|
138
|
+
your stand-alone build/fcst script.
|
139
|
+
|
140
|
+
|
141
|
+
=== Installing
|
142
|
+
|
143
|
+
You can also "install" the fcst script in the normal ruby way using the setup.rb
|
144
|
+
script. This installs the required libraries in your Ruby installation's directories
|
145
|
+
and thus requires root access.
|
146
|
+
|
147
|
+
The first thing you need to do is go into the software directory and install the
|
148
|
+
PluginFactory, ruby-guid, and rubymail tar.gz sources you find. Each project has
|
149
|
+
its own install method, but most use the standard setup.rb or similar. Read their
|
150
|
+
instructions.
|
151
|
+
|
152
|
+
Then installing is done with "ruby setup.rb" in the FastCST source directory. This
|
153
|
+
will install the fcst script in your standard bin directory and the required library
|
154
|
+
files in your standard Ruby setup. You'll probably need root access for this.
|
155
|
+
|
156
|
+
|
157
|
+
== Getting Started
|
158
|
+
|
159
|
+
FastCST is designed to be easy to use without restricting your ability to
|
160
|
+
get stuff done. This means that most commands are very simple and follow
|
161
|
+
a consistent pattern of operation with extensive help. It also means that,
|
162
|
+
if you don't like a command or need an additional feature then you can easily
|
163
|
+
write your own.
|
164
|
+
|
165
|
+
The fcst script has two modes of operation:
|
166
|
+
|
167
|
+
* shell mode -- This lets you run commands in a small little shell so that
|
168
|
+
you don't have to keep typing 'fcst' before you do things.
|
169
|
+
* command line mode -- You can also just put the command after the fcst
|
170
|
+
script in your normal shell and it will run it directly.
|
171
|
+
|
172
|
+
The current shell is very primitive and doesn't support readline or many other
|
173
|
+
nice features.
|
174
|
+
|
175
|
+
If you want to get a list of commands then type 'help'. It actually isn't
|
176
|
+
a command, but the error message for giving an invalid command is to print
|
177
|
+
a list of all the commands available.
|
178
|
+
|
179
|
+
|
180
|
+
=== Available Commands
|
181
|
+
|
182
|
+
FastCST currently supports the following commands:
|
183
|
+
|
184
|
+
* abort -- Aborts any currently in-process revision
|
185
|
+
* apply -- Applies a child revision to the current revision
|
186
|
+
* attach -- Attaches a file to the current in-process revision
|
187
|
+
* begin -- Starts a new in-process revision
|
188
|
+
* disp -- Adds a "disposition" reference to the in-process revision
|
189
|
+
* env -- Lists, searches, sets, and deletes environment variables
|
190
|
+
* finish -- Finalizes an in-process revision
|
191
|
+
* get -- Gets the latest published revisions from a URL
|
192
|
+
* init -- Sets up a new .fastcst repository in the current directory
|
193
|
+
* list -- Lets you list revisions and your current revision path
|
194
|
+
* log -- Adds a log line to the in-process revision (think Change Log)
|
195
|
+
* publish -- Publishes your repository to an FTP site so others can use get
|
196
|
+
* read -- Reads your "pending revision inbox" which contains received revisions
|
197
|
+
* recv -- Receives revisions from a POP3 account and stores them for read
|
198
|
+
* send -- Sends a chosen revision to a chosen person
|
199
|
+
* show -- Shows information about a revision, or the in-process revision
|
200
|
+
* status -- Gives a quick status of the current state of things
|
201
|
+
* undo -- Rolls back the most recent apply
|
202
|
+
* merge -- Basic revision merging. Does not handle conflicts yet (will abort).
|
203
|
+
* index -- Basic suffix array based searching through files.
|
204
|
+
|
205
|
+
All commands have a -h option, and will also complain if everything isn't perfect
|
206
|
+
when you run them. To get the list of commands you can just use ? or help.
|
207
|
+
|
208
|
+
|
209
|
+
=== Your First Repository
|
210
|
+
|
211
|
+
FastCST uses a "project local" repository design. This means that, rather
|
212
|
+
than keeping everything in one master directory, it creates one directory
|
213
|
+
for each project. The directory is called .fastcst and simply sits at the
|
214
|
+
top of your source files.
|
215
|
+
|
216
|
+
What we're going to do is actually get the latest version of FastCST using
|
217
|
+
fcst. Make sure you've downloaded the script or built your own and then
|
218
|
+
do this:
|
219
|
+
|
220
|
+
1. Create an empty directory and cd into it.
|
221
|
+
2. Run fcst
|
222
|
+
3. > init -e <your e-mail> -n <your name> -p fastcst
|
223
|
+
4. > get -u http://www.zedshaw.com/fastcst/repository
|
224
|
+
5. > list
|
225
|
+
6. > show -r root
|
226
|
+
7. > apply -r root
|
227
|
+
|
228
|
+
You now have the root revision of the FastCST 0.6 stuff. The list and show commands
|
229
|
+
aren't necessary, just a good idea so you don't apply something that will cause you
|
230
|
+
damage.
|
231
|
+
|
232
|
+
One current limit of fastcst is that when you get a remote repository it doesn't
|
233
|
+
ask you to apply all the revisions so you can sync up. I'm holding off on that
|
234
|
+
feature until I get the merging algorithm done. Until then, feel free to
|
235
|
+
apply the patches and play with the apply/merge/undo stuff.
|
236
|
+
|
237
|
+
==== Other Things To Try
|
238
|
+
|
239
|
+
Here's some other stuff for you to try out:
|
240
|
+
|
241
|
+
1. You do not need to have an empty directory to put files under fcst control.
|
242
|
+
2. Make a changeset by using begin/finish. You can run begin, make some changes, and then
|
243
|
+
run finish to create the revision.
|
244
|
+
3. Most commands that take a -r also take a -i so you can be specific about which revision
|
245
|
+
to use. Revision names are mostly just for weak humans who can't handle uuids.
|
246
|
+
4. Use list, show, and status religiously to figure out what's going on.
|
247
|
+
5. You can easily send a revision to someone else. Use send to give them one, and then
|
248
|
+
recv to get them from a POP3 account. It works like e-mail where they are downloaded,
|
249
|
+
and then you use 'read' to pick which ones you really want added to your repository.
|
250
|
+
6. Most commands will look for settings in the environment and use them as defaults.
|
251
|
+
Try running a command without parameters and read the lines that talk about not finding
|
252
|
+
a setting in the environment. For example, recv will look for 'POP3 Host' so you can
|
253
|
+
do "env -s 'POP3 Host' -v mail.myserver.com:110" and it will never bug you again. If you
|
254
|
+
need to use a different one temporarily, then specifying the option will override the
|
255
|
+
environment setting.
|
256
|
+
7. If you've got an FTP server and web server to play with, then you can try the publish/get
|
257
|
+
commands.
|
258
|
+
8. All commands take a -h option so you can get extensive help.
|
259
|
+
9. Try the index command. Run it once without options to build the index, then check -h.
|
260
|
+
|
261
|
+
=== More Detailed Documentation
|
262
|
+
|
263
|
+
This README just gives you some quick advice to get you started. Better
|
264
|
+
documentation is available at http://www.zedshaw.com/projects/fastcst including
|
265
|
+
the full API documentation (which is also in doc/rdoc), tutorials, and several
|
266
|
+
other nice things.
|
267
|
+
|
268
|
+
=== Reporting Bugs
|
269
|
+
|
270
|
+
Feel free to contact me at zedshaw AT zedshaw DOT com with any bug reports you have.
|
271
|
+
You may also find the latest release information at
|
272
|
+
http://www.zedshaw.com/projects/fastcst as well as contact me on irc.freenode.org
|
273
|
+
in the #ruby-lang or #rubyonrails channel.
|
274
|
+
|
data/ext/gdiff/lcp.c
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include "sarray.h"
|
4
|
+
|
5
|
+
/*
|
6
|
+
int *lcp(const int *a, const char *s, int n)
|
7
|
+
Precondition: a is suffix array for string s of
|
8
|
+
length n *including* the terminating '\0'.
|
9
|
+
Return value: longest-common-prefix array; 0 on error.
|
10
|
+
Reference: T. Kasai, G. Lee, H. Arimura, S.Arikawa
|
11
|
+
and K. Park, "Linear-time longest-common-prefix
|
12
|
+
computation in suffix arrays and its applications",
|
13
|
+
Proc 12th Annual Conference on Combinatorial Pattern
|
14
|
+
Matching, Springer, LNCS 2089 (2001) 181-192.
|
15
|
+
|
16
|
+
lcp[x] is the length of the longest common prefix of
|
17
|
+
suffixes s[a[x-1]..] and s[a[x]..].
|
18
|
+
|
19
|
+
The algorithm determines the elements of lcp in the
|
20
|
+
order that the suffixes occur in s. It uses this fact:
|
21
|
+
If the lcp for suffix s[i..] has length h, where h>0,
|
22
|
+
then the lcp for suffix s[i+1..] is at least h-1.
|
23
|
+
|
24
|
+
Proof.
|
25
|
+
Let the immediate lexicographic predecessor of suffix
|
26
|
+
s[i..] be s[j..], i.e. inv[i]=inv[j]+1 (inv is defined below).
|
27
|
+
If s[i..] and s[j..] have a common prefix of length h,
|
28
|
+
where h>0, then s[i+1..] and s[j+1..] have a common
|
29
|
+
prefix of length h-1.
|
30
|
+
Since s[i+1..] and s[j+1..] differ from s[i..] and s[j..]
|
31
|
+
respectively only by the deletion of a common first
|
32
|
+
letter, the two pairs must be similarly ordered.
|
33
|
+
Hence s[j+1..] lexicographically precedes s[i+1..].
|
34
|
+
Since s[i+1..] shares a common prefix of length h-1 with
|
35
|
+
some lexicographic predecessor, namely s[j+1..], it
|
36
|
+
must share a common prefix of length at least h-1 with
|
37
|
+
its immediate predecessor. Otherwise the suffix array
|
38
|
+
would be out of order.
|
39
|
+
|
40
|
+
Running time is O(n).
|
41
|
+
|
42
|
+
Proof.
|
43
|
+
h is bounded by n; and h is decreased by 1 at most
|
44
|
+
n times. Hence h is increased at most 2n times.
|
45
|
+
This bounds the number of executions of the inner loop.
|
46
|
+
*/
|
47
|
+
|
48
|
+
/*
|
49
|
+
inv is the inverse of a: if inv[i]=x then a[x]=i.
|
50
|
+
In other words, inv[i] is the index x of the
|
51
|
+
pointer (in array a) to suffix s[i..].
|
52
|
+
*/
|
53
|
+
|
54
|
+
/*
 * Build the longest-common-prefix array for suffix array a of string s.
 *
 *   a - suffix array for s (n entries)
 *   s - the string; n counts its terminating '\0'
 *   n - number of entries in a; must be positive
 *
 * Returns a malloc'ed array of n ints that the caller must free(), or 0
 * when n is not positive, the allocation fails, or lcpa() reports failure.
 */
int*
lcp(const int *a, const char *s, int n)
{
	int *result;

	/*
	 * Reject empty or negative lengths up front: an empty array has no
	 * lcp[0] slot to fill, and a negative n would wrap around to a huge
	 * allocation size once converted to size_t.
	 */
	if(n <= 0)
		return 0;

	result = malloc(n * sizeof *result);
	if(result == 0)
		return 0;

	if(lcpa(a, s, result, n) == 0) {
		free(result);		/* do not leak the buffer on failure */
		return 0;
	}
	return result;
}
|
67
|
+
|
68
|
+
/* lcpa is used by the java native method */
|
69
|
+
|
70
|
+
/*
 * Fill lcp[0..n-1] with longest-common-prefix lengths for suffix array a
 * of string s0, visiting the suffixes in string order (Kasai et al.).
 *
 *   a   - suffix array: must be a permutation of 0..n-1 whose first entry
 *         is n-1, the position of the terminating endmark
 *   s0  - the string, n bytes including the endmark
 *   lcp - caller-supplied output of n ints; lcp[x] receives the length of
 *         the common prefix of suffixes s0[a[x-1]..] and s0[a[x]..], and
 *         lcp[0] is set to 0
 *   n   - number of entries in a and lcp
 *
 * Returns 1 on success.  Returns 0, leaving lcp untouched, when n is not
 * positive, when a is not a permutation as described above, or when the
 * scratch array cannot be allocated.
 */
int
lcpa(const int *a, const char *s0, int *lcp, int n)
{
	const unsigned char *s = (const unsigned char*)s0;
	int *inv;
	int i, h;

	if(n <= 0)
		return 0;

	inv = malloc(n * sizeof *inv);
	if(inv == 0)
		return 0;

	/*
	 * Build inv, the inverse of a, and validate a while doing so: every
	 * entry must lie in 0..n-1 and appear exactly once, otherwise the
	 * stores into inv[] and the loads from a[] below would go out of
	 * bounds.
	 */
	for(i = 0; i < n; i++)
		inv[i] = -1;
	for(i = 0; i < n; i++) {
		int pos = a[i];

		if(pos < 0 || pos >= n || inv[pos] != -1) {
			free(inv);
			return 0;
		}
		inv[pos] = i;
	}

	/*
	 * The endmark suffix must be ranked first.  That is what makes
	 * inv[i] >= 1, and hence a[inv[i]-1] valid, for every i < n-1.
	 */
	if(a[0] != n-1) {
		free(inv);
		return 0;
	}

	h = 0;				/* visit in string order */
	for(i = 0; i < n-1; i++) {	/* omit last, least suffix */
		int x = inv[i];		/* rank of suffix i */
		int j = a[x-1];		/* its lexicographic predecessor */

		/*
		 * Extend the match carried over from the previous suffix.
		 * The scan stops before the endmark at n-1, so the endmark
		 * never compares equal to a data byte and no read goes
		 * beyond the n bytes of s.
		 */
		while(i+h < n-1 && j+h < n-1 && s[i+h] == s[j+h])
			h++;
		lcp[x] = h;
		if(h > 0)
			h--;		/* next suffix shares at least h-1 */
	}
	lcp[0] = 0;			/* least suffix has no predecessor */
	free(inv);
	return 1;
}
|
data/ext/gdiff/sarray.3
ADDED
@@ -0,0 +1,145 @@
|
|
1
|
+
.TH SARRAY 3
|
2
|
+
.SH NAME
|
3
|
+
sarray, ssarray, bsarray, lcp, scode \- suffix-array functions
|
4
|
+
.SH SYNOPSIS
|
5
|
+
.nf
|
6
|
+
.ft B
|
7
|
+
#include "sarray.h"
|
8
|
+
int sarray(int *a, int n);
|
9
|
+
int ssarray(int *a);
|
10
|
+
int bsarray(const unsigned char *s, int *a, int n);
|
11
|
+
int *lcp(const int *a, const char *s, int n);
|
12
|
+
int *scode(const char *s);
|
13
|
+
.SH DESCRIPTION
|
14
|
+
.I Sarray
|
15
|
+
and
|
16
|
+
.I ssarray
|
17
|
+
convert array
|
18
|
+
.I a
|
19
|
+
into a suffix array for
|
20
|
+
.I a.
|
21
|
+
The
|
22
|
+
.I n
|
23
|
+
values in
|
24
|
+
.I a
|
25
|
+
must form a contiguous set of integers in the range 0 to some positive value,
|
26
|
+
with 0 occurring only as an endmark, in
|
27
|
+
.IR a [ n \-1].
|
28
|
+
.PP
|
29
|
+
.I Bsarray
|
30
|
+
builds, in
|
31
|
+
.IR a
|
32
|
+
(of length
|
33
|
+
.IR n +1),
|
34
|
+
a suffix array for the
|
35
|
+
.IR n -byte
|
36
|
+
string
|
37
|
+
.IR s ,
|
38
|
+
which need not contain an endmark.
|
39
|
+
.PP
|
40
|
+
All three suffix-array builders return the index
|
41
|
+
at which the whole string is identified in
|
42
|
+
.IR a .
|
43
|
+
(This value is used in Burrows-Wheeler data compression.)
|
44
|
+
.PP
|
45
|
+
.I Lcp
|
46
|
+
returns an array
|
47
|
+
.IR l ,
|
48
|
+
in which
|
49
|
+
.IR l [ j ]
|
50
|
+
is the length of the longest common prefix of
|
51
|
+
the suffixes identified by
|
52
|
+
.IR a [ j \-1]
|
53
|
+
and
|
54
|
+
.IR a [ j ],
|
55
|
+
except
|
56
|
+
.IR l [0]=0.
|
57
|
+
It runs in time
|
58
|
+
.IR O ( n )
|
59
|
+
and uses temporary space equal in size to
|
60
|
+
.IR a .
|
61
|
+
.PP
|
62
|
+
.I Scode
|
63
|
+
returns an encoding of string
|
64
|
+
.I s
|
65
|
+
in a form suitable for input to
|
66
|
+
.I sarray
|
67
|
+
or
|
68
|
+
.I ssarray.
|
69
|
+
.SS Explanation
|
70
|
+
Suffix arrays are
|
71
|
+
useful for information retrieval, biological sequence analysis,
|
72
|
+
plagiarism detection, data compression, linguistic analysis, etc.
|
73
|
+
.PP
|
74
|
+
A suffix array
|
75
|
+
identifies, in lexicographic order, the (positions of) the
|
76
|
+
suffixes of a given string.
|
77
|
+
Thus the suffix array for the string "abab",
|
78
|
+
including its final null character, is
|
79
|
+
{4,2,0,3,1},
|
80
|
+
identifying the suffixes "",
|
81
|
+
"ab", "abab", "b", "bab".
|
82
|
+
Equivalently, it identifies circular shifts
|
83
|
+
in lexicographic order. For the string "abab", with #
|
84
|
+
as a visible endmark, the shifts are "#abab", "ab#ab",
|
85
|
+
"abab#", "b#aba", "bab#a".
|
86
|
+
.PP
|
87
|
+
The three array-building functions run in time
|
88
|
+
.IR O ( n " log " n ).
|
89
|
+
.I Sarray
|
90
|
+
and
|
91
|
+
.I bsarray
|
92
|
+
use a hybrid algorithm, typically several times
|
93
|
+
as fast as the deliberately simple
|
94
|
+
.IR ssarray .
|
95
|
+
All three require temporary space equal in size to
|
96
|
+
.I a.
|
97
|
+
Space overhead may be reduced by using
|
98
|
+
.IR qsort (3)
|
99
|
+
with a suitable comparison function,
|
100
|
+
but running time then becomes at best
|
101
|
+
.IR O ( nm " log " n )
|
102
|
+
.RI "where " m
|
103
|
+
is the length of the longest repeated substring.
|
104
|
+
.SH EXAMPLES
|
105
|
+
.HP
|
106
|
+
Build, in
|
107
|
+
.I a
|
108
|
+
and
|
109
|
+
.I l
|
110
|
+
respectively,
|
111
|
+
a suffix array for string
|
112
|
+
.I s
|
113
|
+
and the associated lcp array.
|
114
|
+
.
|
115
|
+
.br
|
116
|
+
.nf
|
117
|
+
.ft CW
|
118
|
+
int *l;
|
119
|
+
int n = strlen(s)+1;
|
120
|
+
int *a = scode(s);
|
121
|
+
sarray(a, n);
|
122
|
+
l = lcp(a, s, n);
|
123
|
+
.fi
|
124
|
+
.HP
|
125
|
+
Build the same suffix array, using
|
126
|
+
.I bsarray.
|
127
|
+
.br
|
128
|
+
.nf
|
129
|
+
.ft CW
|
130
|
+
int n = strlen(s);
|
131
|
+
int *a = malloc((n+1)*sizeof(int));
|
132
|
+
bsarray((unsigned char*)s, a, n);
|
133
|
+
.SH "RETURN VALUE"
|
134
|
+
.I Sarray,
|
135
|
+
.I ssarray,
|
136
|
+
and
|
137
|
+
.I bsarray
|
138
|
+
return \-1 for bad data or insufficient space.
|
139
|
+
.PP
|
140
|
+
.I Lcp
|
141
|
+
and
|
142
|
+
.I scode
|
143
|
+
return
|
144
|
+
.IR malloc 'ed
|
145
|
+
arrays, or 0 for bad data or insufficient space.
|