gdiff 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +7 -0
- data/COPYING.suffix_array +278 -0
- data/LICENSE.suffix_array +17 -0
- data/README +40 -0
- data/README.suffix_array +274 -0
- data/bin/gdiff +25 -0
- data/bin/gpatch +25 -0
- data/doc/classes/Diff.html +117 -0
- data/doc/classes/Diff/GDiff.html +120 -0
- data/doc/classes/Diff/GDiff/EGdiffError.html +111 -0
- data/doc/classes/Diff/GDiff/ENoGdiffStream.html +113 -0
- data/doc/classes/Diff/GDiff/EPrematureEndOfStream.html +113 -0
- data/doc/classes/Diff/GDiff/Operations.html +156 -0
- data/doc/classes/Diff/GDiff/Operations/Copy.html +246 -0
- data/doc/classes/Diff/GDiff/Operations/Copy.src/M000014.html +19 -0
- data/doc/classes/Diff/GDiff/Operations/Copy.src/M000015.html +39 -0
- data/doc/classes/Diff/GDiff/Operations/Copy.src/M000016.html +25 -0
- data/doc/classes/Diff/GDiff/Operations/Copy.src/M000017.html +18 -0
- data/doc/classes/Diff/GDiff/Operations/Data.html +246 -0
- data/doc/classes/Diff/GDiff/Operations/Data.src/M000009.html +18 -0
- data/doc/classes/Diff/GDiff/Operations/Data.src/M000010.html +18 -0
- data/doc/classes/Diff/GDiff/Operations/Data.src/M000011.html +35 -0
- data/doc/classes/Diff/GDiff/Operations/Data.src/M000012.html +29 -0
- data/doc/classes/Diff/GDiff/Operations/Data.src/M000013.html +19 -0
- data/doc/classes/SAError.html +111 -0
- data/doc/classes/SuffixArray.html +342 -0
- data/doc/classes/SuffixArray.src/M000001.html +97 -0
- data/doc/classes/SuffixArray.src/M000002.html +73 -0
- data/doc/classes/SuffixArray.src/M000003.html +102 -0
- data/doc/classes/SuffixArray.src/M000004.html +47 -0
- data/doc/classes/SuffixArray.src/M000005.html +44 -0
- data/doc/classes/SuffixArray.src/M000006.html +33 -0
- data/doc/classes/SuffixArray.src/M000007.html +24 -0
- data/doc/classes/SuffixArray.src/M000008.html +46 -0
- data/doc/created.rid +1 -0
- data/doc/files/ext/gdiff/suffix_array/extconf_rb.html +108 -0
- data/doc/files/ext/gdiff/suffix_array/lcp_c.html +101 -0
- data/doc/files/ext/gdiff/suffix_array/sarray_c.html +101 -0
- data/doc/files/ext/gdiff/suffix_array/suffix_array_c.html +101 -0
- data/doc/files/lib/gdiff_rb.html +108 -0
- data/doc/fr_class_index.html +36 -0
- data/doc/fr_file_index.html +31 -0
- data/doc/fr_method_index.html +43 -0
- data/doc/index.html +24 -0
- data/doc/rdoc-style.css +208 -0
- data/ext/gdiff/COPYING +278 -0
- data/ext/gdiff/LICENSE +17 -0
- data/ext/gdiff/README +274 -0
- data/ext/gdiff/extconf.rb +3 -0
- data/ext/gdiff/lcp.c +97 -0
- data/ext/gdiff/sarray.3 +145 -0
- data/ext/gdiff/sarray.c +372 -0
- data/ext/gdiff/sarray.h +13 -0
- data/ext/gdiff/suffix_array.c +510 -0
- data/lib/gdiff.rb +255 -0
- data/setup.rb +1551 -0
- data/test/tc_gdiff.rb +66 -0
- metadata +119 -0
data/ext/gdiff/LICENSE
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
FastCST -- A tool for revision control.
|
2
|
+
|
3
|
+
Copyright (C) 2004-2005 Zed A. Shaw
|
4
|
+
|
5
|
+
This program is free software; you can redistribute it and/or modify
|
6
|
+
it under the terms of the GNU General Public License as published by
|
7
|
+
the Free Software Foundation; either version 2 of the License, or
|
8
|
+
(at your option) any later version.
|
9
|
+
|
10
|
+
This program is distributed in the hope that it will be useful,
|
11
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
GNU General Public License for more details.
|
14
|
+
|
15
|
+
You should have received a copy of the GNU General Public License
|
16
|
+
along with this program; if not, write to the Free Software
|
17
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
data/ext/gdiff/README
ADDED
@@ -0,0 +1,274 @@
|
|
1
|
+
== Fast Change Set Tool
|
2
|
+
|
3
|
+
FastCST is a change set tool I created to experiment with
|
4
|
+
ideas in change management, distributed development, and alternatives to existing
|
5
|
+
tools.
|
6
|
+
|
7
|
+
== Current Features
|
8
|
+
|
9
|
+
FastCST is an evolving tool that you probably shouldn't use quite yet for
|
10
|
+
anything serious. It does support a wide range of features, but since I
|
11
|
+
started it in March 2005 it's not fully tested yet.
|
12
|
+
|
13
|
+
The current laundry list of features is:
|
14
|
+
|
15
|
+
* Complete changesets that are very simple by design and encode complete
|
16
|
+
cohesive changes.
|
17
|
+
* Extensible meta-data for changesets.
|
18
|
+
* A simple to use repository that should be easy for any other languages to read.
|
19
|
+
* All revisions are uniquely identified by a UUID to avoid clashing.
|
20
|
+
* Ability to undo any applied revision.
|
21
|
+
* You can publish your repository to an FTP site efficiently.
|
22
|
+
* Directly share your repository for quick small-scale sharing without FTP.
|
23
|
+
* Anyone can download the latest revisions from an FTP or HTTP site.
|
24
|
+
* Send/receive changesets through e-mail including human readable meta-data.
|
25
|
+
* Transaction oriented operation.
|
26
|
+
* Remarkably fast operation considering there's been no optimization done and it's written in Ruby.
|
27
|
+
* A reasonable shell prompt so you don't have to type 'fcst' in front of everything.
|
28
|
+
* A working plugins feature letting you implement your own commands, with an example
|
29
|
+
command that creates release archives.
|
30
|
+
* A working "triggers" feature so you can easily wrap commands with your own logic.
|
31
|
+
* Ability to attach external files to distribute with your changesets (not tested much).
|
32
|
+
* Disposition references so you can reference bug trackers, web sites, mailing list posts, etc.
|
33
|
+
* 95% pure Ruby.
|
34
|
+
* A unique delta algorithm that uses suffix arrays and produces smaller deltas than most other
|
35
|
+
delta algorithms without sacrificing speed much.
|
36
|
+
* A painstakingly consistent command interface with extensive help for each option.
|
37
|
+
* The beginning of a merge feature that lets you merge without needing to reference history.
|
38
|
+
* An "index" command that will use suffix arrays to index your files and let you find where
|
39
|
+
text is mentioned. This will turn into an optimization and an advanced search feature.
|
40
|
+
* Simple and consistent aborting and undo so you can trash your source in the comfort of your
|
41
|
+
own stupidity and still recover.
|
42
|
+
* Completely redesigned YAML format that avoids using !ruby object loading and uses only
|
43
|
+
native types found in all languages. This avoids some security concerns, but YAML needs
|
44
|
+
a means of telling it to NOT load arbitrary objects. Still searching, but I may have to
|
45
|
+
dump YAML if I can't fix it.
|
46
|
+
* Makes a reasonable attempt to deal with symlinks and directories. It treats directories
|
47
|
+
as out of band data and simply fixes them up at the end of the application process. This
|
48
|
+
is very handy since you can just glance at the meta-data to find out if someone is deleting
|
49
|
+
your favorite source tree.
|
50
|
+
|
51
|
+
|
52
|
+
== Missing Features
|
53
|
+
|
54
|
+
There's still quite a lot of stuff missing that I want to put into FastCST, but these are
|
55
|
+
the big ones that it needs before it's useful:
|
56
|
+
|
57
|
+
* Merging is implemented, but conflict resolution is not yet. It currently will not let
|
58
|
+
you resolve conflicts and refuses to do the merge.
|
59
|
+
* Recovering individual files from the repository. This is needed to get conflicts working.
|
60
|
+
* Better security protections like not using YAML for the journal file or somehow restricting
|
61
|
+
what objects can be loaded (that thing is like giving a toddler a shot-gun).
|
62
|
+
* Digitally signed and verified revisions so people can confirm who sent the revision.
|
63
|
+
* Improved safety checks. It's pretty good now, but things like applying a delta are still
|
64
|
+
not as safe as I'd like.
|
65
|
+
|
66
|
+
Some of the things I'd like to implement are:
|
67
|
+
|
68
|
+
* Connecting with FAM or Dazuko to let FastCST track your actions and warn about bad stuff.
|
69
|
+
* Flexible command aliasing that lets you create alternative commands.
|
70
|
+
* Using mDNS to let people quickly and painlessly find repositories and other developers.
|
71
|
+
* Hooking into DamageControl and the RSCM library.
|
72
|
+
|
73
|
+
If you have any suggestions for these or for other features you want then go ahead and
|
74
|
+
contact me at zedshaw AT zedshaw DOT com.
|
75
|
+
|
76
|
+
== Security Warnings
|
77
|
+
|
78
|
+
DO NOT ACCEPT CHANGESETS FROM PEOPLE YOU DO NOT KNOW. Since there's no digital signature
|
79
|
+
capabilities this means nobody except yourself. The reason why is because there are many
|
80
|
+
places where YAML is used, but YAML doesn't provide a mechanism for restricting what can
|
81
|
+
be loaded when unmarshalling Ruby structures. It would be no problem for someone to create
|
82
|
+
a meta-data or journal file with a bit of code to destroy your world.
|
83
|
+
|
84
|
+
Another really big caution is please don't use this thing on any source you feel is really
|
85
|
+
important. It is still mostly ALPHA stage so there's a very good chance that you'll destroy
|
86
|
+
your world if you use it. Especially important is that the delta algorithm is fairly new
|
87
|
+
and the suffix array library needs to be audited more.
|
88
|
+
|
89
|
+
Finally, it uses POP3 and FTP with bare passwords. This is mostly because I'm not sure how
|
90
|
+
to get the Net::POP and Net::FTP stuff to use APOP and/or SSL. For the most part I'm just
|
91
|
+
tunneling the protocols through SSH to my servers.
|
92
|
+
|
93
|
+
|
94
|
+
== License
|
95
|
+
|
96
|
+
Copyright (C) 2004-2005 Zed A. Shaw
|
97
|
+
|
98
|
+
This program is free software; you can redistribute it and/or modify
|
99
|
+
it under the terms of the GNU General Public License as published by
|
100
|
+
the Free Software Foundation; either version 2 of the License, or
|
101
|
+
(at your option) any later version.
|
102
|
+
|
103
|
+
This program is distributed in the hope that it will be useful,
|
104
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
105
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
106
|
+
GNU General Public License for more details.
|
107
|
+
|
108
|
+
You should have received a copy of the GNU General Public License
|
109
|
+
along with this program; if not, write to the Free Software
|
110
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
111
|
+
|
112
|
+
|
113
|
+
=== Building
|
114
|
+
|
115
|
+
The script that does everything is called "fcst", and the Rakefile is set up to build
|
116
|
+
a completely stand-alone version. This is the version that you can run with only a
|
117
|
+
basic ruby 1.8 install rather than installing all the files in your Ruby directories.
|
118
|
+
|
119
|
+
Building the fcst script this way requires simply typing "rake" in the source
|
120
|
+
directory. When the build is done you'll have a single ruby script in build/fcst.
|
121
|
+
|
122
|
+
|
123
|
+
==== Debian Notes
|
124
|
+
|
125
|
+
The build has been tested on ArchLinux and Debian, but to get it to build or
|
126
|
+
work under Debian you'll need to do some magic apt-get work:
|
127
|
+
|
128
|
+
1. Remove anything remotely related to ruby. The package layout changed recently
|
129
|
+
so this is necessary to get it to reinstall correctly.
|
130
|
+
2. apt-get install ruby (not ruby1.8). If you're on the right version of Debian
|
131
|
+
(of the 100) you should get the 1.8 stuff with all the goodies.
|
132
|
+
3. Finally make sure you have these packages as well:
|
133
|
+
* rake
|
134
|
+
* ruby1.8-dev (yes, specify the 1.8 this time)
|
135
|
+
* libtest-unit-ruby (no 1.8 this time)
|
136
|
+
|
137
|
+
Once you do this you can then use the "rake" command to build everything and get
|
138
|
+
your stand-alone build/fcst script.
|
139
|
+
|
140
|
+
|
141
|
+
=== Installing
|
142
|
+
|
143
|
+
You can also "install" the fcst script in the normal ruby way using the setup.rb
|
144
|
+
script. This installs the required libraries in your Ruby installation's directories
|
145
|
+
and thus requires root access.
|
146
|
+
|
147
|
+
The first thing you need to do is go into the software directory and install the
|
148
|
+
PluginFactory, ruby-guid, and rubymail tar.gz sources you find. Each project has
|
149
|
+
its own install method, but most use the standard setup.rb or similar. Read their
|
150
|
+
instructions.
|
151
|
+
|
152
|
+
Then installing is done with "ruby setup.rb" in the FastCST source directory. This
|
153
|
+
will install the fcst script in your standard bin directory and the required library
|
154
|
+
files in your standard Ruby setup. You'll probably need root access for this.
|
155
|
+
|
156
|
+
|
157
|
+
== Getting Started
|
158
|
+
|
159
|
+
FastCST is designed to be easy to use without restricting your ability to
|
160
|
+
get stuff done. This means that most commands are very simple and follow
|
161
|
+
a consistent pattern of operation with extensive help. It also means that,
|
162
|
+
if you don't like a command or need an additional feature then you can easily
|
163
|
+
write your own.
|
164
|
+
|
165
|
+
The fcst script has two modes of operation:
|
166
|
+
|
167
|
+
* shell mode -- This lets you run commands in a small little shell so that
|
168
|
+
you don't have to keep typing 'fcst' before you do things.
|
169
|
+
* command line mode -- You can also just put the command after the fcst
|
170
|
+
script in your normal shell and it will run it directly.
|
171
|
+
|
172
|
+
The current shell is very primitive and doesn't support readline or many other
|
173
|
+
nice features.
|
174
|
+
|
175
|
+
If you want to get a list of commands then type 'help'. It actually isn't
|
176
|
+
a command, but the error message for giving an invalid command is to print
|
177
|
+
a list of all the commands available.
|
178
|
+
|
179
|
+
|
180
|
+
=== Available Commands
|
181
|
+
|
182
|
+
FastCST currently supports the following commands:
|
183
|
+
|
184
|
+
* abort -- Aborts any currently in-process revision
|
185
|
+
* apply -- Applies a child revision to the current revision
|
186
|
+
* attach -- Attaches a file to the current in-process revision
|
187
|
+
* begin -- Starts a new in-process revision
|
188
|
+
* disp -- Adds a "disposition" reference to the in-process revision
|
189
|
+
* env -- Lists, searches, sets, and deletes environment variables
|
190
|
+
* finish -- Finalizes an in-process revision
|
191
|
+
* get -- Gets the latest published revisions from a URL
|
192
|
+
* init -- Sets up a new .fastcst repository in the current directory
|
193
|
+
* list -- Lets you list revisions and your current revision path
|
194
|
+
* log -- Adds a log line to the in-process revision (think Change Log)
|
195
|
+
* publish -- Publishes your repository to an FTP site so others can use get
|
196
|
+
* read -- Reads your "pending revision inbox" which contains received revisions
|
197
|
+
* recv -- Receives revisions from a POP3 account and stores them for read
|
198
|
+
* send -- Sends a chosen revision to a chosen person
|
199
|
+
* show -- Shows information about a revision, or the in-process revision
|
200
|
+
* status -- Gives a quick status of the current state of things
|
201
|
+
* undo -- Rolls back the most recent apply
|
202
|
+
* merge -- Basic revision merging. Does not handle conflicts yet (will abort).
|
203
|
+
* index -- Basic suffix array based searching through files.
|
204
|
+
|
205
|
+
All commands have a -h option, and will also complain if everything isn't perfect
|
206
|
+
when you run them. To get the list of commands you can just use ? or help.
|
207
|
+
|
208
|
+
|
209
|
+
=== Your First Repository
|
210
|
+
|
211
|
+
FastCST uses a "project local" repository design. This means that, rather
|
212
|
+
than keeping everything in one master directory, it creates one directory
|
213
|
+
for each project. The directory is called .fastcst and simply sits at the
|
214
|
+
top of your source files.
|
215
|
+
|
216
|
+
What we're going to do is actually get the latest version of FastCST using
|
217
|
+
fcst. Make sure you've downloaded the script or built your own and then
|
218
|
+
do this:
|
219
|
+
|
220
|
+
1. Create an empty directory and cd into it.
|
221
|
+
2. Run fcst
|
222
|
+
3. > init -e <your e-mail> -n <your name> -p fastcst
|
223
|
+
4. > get -u http://www.zedshaw.com/fastcst/repository
|
224
|
+
5. > list
|
225
|
+
6. > show -r root
|
226
|
+
7. > apply -r root
|
227
|
+
|
228
|
+
You now have the root revision of the FastCST 0.6 stuff. The list and show commands
|
229
|
+
aren't necessary, just a good idea so you don't apply something that will cause you
|
230
|
+
damage.
|
231
|
+
|
232
|
+
One current limit of fastcst is that when you get a remote repository it doesn't
|
233
|
+
ask you to apply all the revisions so you can sync up. I'm holding off on that
|
234
|
+
feature until I get the merging algorithm done. Until then, feel free to
|
235
|
+
apply the patches and play with the apply/merge/undo stuff.
|
236
|
+
|
237
|
+
==== Other Things To Try
|
238
|
+
|
239
|
+
Here's some other stuff for you to try out:
|
240
|
+
|
241
|
+
1. You do not need to have an empty directory to put files under fcst control.
|
242
|
+
2. Make a changeset by using begin/finish. You can run begin, make some changes, and then
|
243
|
+
run finish to create the revision.
|
244
|
+
3. Most commands that take a -r also take a -i so you can be specific about which revision
|
245
|
+
to use. Revision names are mostly just for weak humans who can't handle uuids.
|
246
|
+
4. Use list, show, and status religiously to figure out what's going on.
|
247
|
+
5. You can easily send a revision to someone else. Use send to give them one, and then
|
248
|
+
recv to get them from a POP3 account. It works like e-mail where they are downloaded,
|
249
|
+
and then you use 'read' to pick which ones you really want added to your repository.
|
250
|
+
6. Most commands will look for settings in the environment and use them as defaults.
|
251
|
+
Try running a command without parameters and read the lines that talk about not finding
|
252
|
+
a setting in the environment. For example, recv will look for 'POP3 Host' so you can
|
253
|
+
do "env -s 'POP3 Host' -v mail.myserver.com:110" and it will never bug you again. If you
|
254
|
+
need to use a different one temporarily, then specifying the option will override the
|
255
|
+
environment setting.
|
256
|
+
7. If you've got an FTP server and web server to play with, then you can try the publish/get
|
257
|
+
commands.
|
258
|
+
8. All commands take a -h option so you can get extensive help.
|
259
|
+
9. Try the index command. Run it once without options to build the index, then check -h.
|
260
|
+
|
261
|
+
=== More Detailed Documentation
|
262
|
+
|
263
|
+
This README just gives you some quick advice to get you started. Better
|
264
|
+
documentation is available at http://www.zedshaw.com/projects/fastcst including
|
265
|
+
the full API documentation (which is also in doc/rdoc), tutorials, and several
|
266
|
+
other nice things.
|
267
|
+
|
268
|
+
=== Reporting Bugs
|
269
|
+
|
270
|
+
Feel free to contact me at zedshaw AT zedshaw DOT com with any bug reports you have.
|
271
|
+
You may also find the latest release information at
|
272
|
+
http://www.zedshaw.com/projects/fastcst as well as contact me on irc.freenode.org
|
273
|
+
in the #ruby-lang or #rubyonrails channel.
|
274
|
+
|
data/ext/gdiff/lcp.c
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include "sarray.h"
|
4
|
+
|
5
|
+
/*
|
6
|
+
int *lcp(const int *a, const char *s, int n)
|
7
|
+
Precondition: a is suffix array for string s of
|
8
|
+
length n *including* the terminating '\0'.
|
9
|
+
Return value: longest-common-prefix array; 0 on error.
|
10
|
+
Reference: T. Kasai, G. Lee, H. Arimura, S.Arikawa
|
11
|
+
and K. Park, "Linear-time longest-common-prefix
|
12
|
+
computation in suffix arrays and its applications",
|
13
|
+
Proc 12th Annual Conference on Combinatorial Pattern
|
14
|
+
Matching, Springer, LNCS 2089 (2001) 181-192.
|
15
|
+
|
16
|
+
lcp[x] is the length of the longest common prefix of
|
17
|
+
suffixes s[a[x-1]..] and s[a[x]..].
|
18
|
+
|
19
|
+
The algorithm determines the elements of lcp in the
|
20
|
+
order that the suffixes occur in s. It uses this fact:
|
21
|
+
If the lcp for suffix s[i..] has length h, where h>0,
|
22
|
+
then the lcp for suffix s[i+1..] is at least h-1.
|
23
|
+
|
24
|
+
Proof.
|
25
|
+
Let the immediate lexicographic predecessor of suffix
|
26
|
+
s[i..] be s[j..], i.e. lex[i]=lex[j]+1.
|
27
|
+
If s[i..] and s[j..] have a common prefix of length h,
|
28
|
+
where h>0, then s[i+1..] and s[j+1..] have a common
|
29
|
+
prefix of length h-1.
|
30
|
+
Since s[i+1..] and s[j+1..] differ from s[i..] and s[j..]
|
31
|
+
respectively only by the deletion of a common first
|
32
|
+
letter, the two pairs must be similarly ordered.
|
33
|
+
Hence s[j+1..] lexicographically precedes s[i+1..].
|
34
|
+
Since s[i+1..] shares a common prefix of length h-1 with
|
35
|
+
some lexicographic predecessor, namely s[j+1..], it
|
36
|
+
must share a common prefix of length at least h-1 with
|
37
|
+
its immediate predecessor. Otherwise the suffix array
|
38
|
+
would be out of order.
|
39
|
+
|
40
|
+
Running time is O(n).
|
41
|
+
|
42
|
+
Proof.
|
43
|
+
h is bounded by n; and h is decreased by 1 at most
|
44
|
+
n times. Hence h is increased at most 2n times.
|
45
|
+
This bounds the number of executions of the inner loop.
|
46
|
+
*/
|
47
|
+
|
48
|
+
/*
|
49
|
+
inv is the inverse of a: if inv[i]=x then a[x]=i.
|
50
|
+
In other words, inv[i] is the index x of the
|
51
|
+
pointer (in array a) to suffix s[i..].
|
52
|
+
*/
|
53
|
+
|
54
|
+
/*
 * Allocate and return the longest-common-prefix array for suffix
 * array a of string s, where n is the length of s *including* its
 * terminating '\0'.
 *
 * Returns a malloc'ed array of n ints that the caller must free(),
 * or 0 if n is not positive, memory is exhausted, or lcpa() fails.
 */
int*
lcp(const int *a, const char *s, int n)
{
	int *result;

	/*
	 * Reject empty/negative lengths up front: malloc(0) may hand back
	 * a non-null zero-size block that lcpa() would then write
	 * result[0] into, and a negative n would convert to a huge size.
	 */
	if(n <= 0)
		return 0;

	result = malloc((size_t)n * sizeof *result);
	if(result == NULL)
		return 0;

	if(lcpa(a, s, result, n) == 0) {
		free(result);	/* do not leak the buffer on failure */
		return 0;
	}
	return result;
}
|
67
|
+
|
68
|
+
/* lcpa is used by the java native method */
|
69
|
+
|
70
|
+
/*
 * Fill the caller-supplied array lcp (n ints) with the
 * longest-common-prefix lengths for suffix array a of the n-byte
 * string s0: lcp[x] is the length of the longest common prefix of
 * the suffixes s0[a[x-1]..] and s0[a[x]..], and lcp[0] is 0.
 *
 * Returns 1 on success.  Returns 0 if n is not positive, if the
 * temporary inverse array cannot be allocated, or if a is not a
 * permutation of 0..n-1 (bad data); lcp is not meaningful then.
 *
 * Bytes are compared as unsigned char and never read beyond
 * s0[n-1], so embedded '\0' bytes and inputs lacking a unique
 * endmark are handled without overrunning the buffer.
 */
int
lcpa(const int *a, const char *s0, int *lcp, int n)
{
	const unsigned char *s = (const unsigned char *)s0;
	int *inv;	/* inverse of a: inv[i]=x means a[x]=i */
	int i, h;

	if(n <= 0)
		return 0;

	inv = malloc((size_t)n * sizeof *inv);
	if(inv == NULL)
		return 0;

	/*
	 * Build the inverse while checking that a really is a
	 * permutation of 0..n-1; -1 marks a slot not yet claimed.
	 */
	for(i = 0; i < n; i++)
		inv[i] = -1;
	for(i = 0; i < n; i++) {
		int pos = a[i];

		if(pos < 0 || pos >= n || inv[pos] != -1) {
			free(inv);
			return 0;
		}
		inv[pos] = i;
	}

	h = 0;				/* visit in string order */
	for(i = 0; i < n; i++) {
		int x = inv[i];		/* rank of suffix s[i..] */
		int j;

		if(x == 0) {
			/*
			 * Least suffix has no predecessor.  For well-formed
			 * input this is only the final endmark suffix.
			 */
			h = 0;
			continue;
		}
		j = a[x-1];		/* lexicographic predecessor */

		/*
		 * Extend the match, stopping at the end of the string.
		 * Written as h < n-i (not i+h < n) to avoid int overflow.
		 */
		while(h < n - i && h < n - j && s[i+h] == s[j+h])
			h++;
		lcp[x] = h;
		if(h > 0)
			h--;		/* next suffix shares at least h-1 */
	}
	lcp[0] = 0;			/* least suffix has no predecessor */
	free(inv);
	return 1;
}
|
data/ext/gdiff/sarray.3
ADDED
@@ -0,0 +1,145 @@
|
|
1
|
+
.TH SARRAY 3
|
2
|
+
.SH NAME
|
3
|
+
sarray, ssarray, bsarray, lcp, scode \- suffix-array functions
|
4
|
+
.SH SYNOPSIS
|
5
|
+
.nf
|
6
|
+
.ft B
|
7
|
+
#include "sarray.h"
|
8
|
+
int sarray(int *a, int n);
|
9
|
+
int ssarray(int *a);
|
10
|
+
int bsarray(const unsigned char *s, int *a, int n);
|
11
|
+
int *lcp(const int *a, const char *s, int n);
|
12
|
+
unsigned char *scode(const char *s);
|
13
|
+
.SH DESCRIPTION
|
14
|
+
.I Sarray
|
15
|
+
and
|
16
|
+
.I ssarray
|
17
|
+
convert array
|
18
|
+
.I a
|
19
|
+
into a suffix array for
|
20
|
+
.I a.
|
21
|
+
The
|
22
|
+
.I n
|
23
|
+
values in
|
24
|
+
.I a
|
25
|
+
must form a contiguous set of integers in the range 0 to some positive value,
|
26
|
+
with 0 occurring only as an endmark, in
|
27
|
+
.IR a [ n \-1].
|
28
|
+
.PP
|
29
|
+
.I Bsarray
|
30
|
+
builds, in
|
31
|
+
.IR a
|
32
|
+
(of length
|
33
|
+
.IR n +1),
|
34
|
+
a suffix array for the
|
35
|
+
.IR n -byte
|
36
|
+
string
|
37
|
+
.IR s ,
|
38
|
+
which need not contain an endmark.
|
39
|
+
.PP
|
40
|
+
All three suffix-array builders return the index
|
41
|
+
at which the whole string is identified in
|
42
|
+
.IR a .
|
43
|
+
(This value is used in Burrows-Wheeler data compression.)
|
44
|
+
.PP
|
45
|
+
.I Lcp
|
46
|
+
returns an array
|
47
|
+
.IR l ,
|
48
|
+
in which
|
49
|
+
.IR l [ j ]
|
50
|
+
is the length of the longest common prefix of
|
51
|
+
the suffixes identified by
|
52
|
+
.IR a [ j \-1]
|
53
|
+
and
|
54
|
+
.IR a [ j ],
|
55
|
+
except
|
56
|
+
.IR l [0]=0.
|
57
|
+
It runs in time
|
58
|
+
.IR O ( n )
|
59
|
+
and uses temporary space equal in size to
|
60
|
+
.IR a .
|
61
|
+
.PP
|
62
|
+
.I Scode
|
63
|
+
returns an encoding of string
|
64
|
+
.I s
|
65
|
+
in a form suitable for input to
|
66
|
+
.I sarray
|
67
|
+
or
|
68
|
+
.I ssarray.
|
69
|
+
.SS Explanation
|
70
|
+
Suffix arrays are
|
71
|
+
useful for information retrieval, biological sequence analysis,
|
72
|
+
plagiarism detection, data compression, linguistic analysis, etc.
|
73
|
+
.PP
|
74
|
+
A suffix array
|
75
|
+
identifies, in lexicographic order, the (positions of) the
|
76
|
+
suffixes of a given string.
|
77
|
+
Thus the suffix array for the string "abab",
|
78
|
+
including its final null character, is
|
79
|
+
{4,2,0,3,1},
|
80
|
+
identifying the suffixes "",
|
81
|
+
"ab", "abab", "b", "bab".
|
82
|
+
Equivalently, it identifies circular shifts
|
83
|
+
in lexicographic order. For the string "abab", with #
|
84
|
+
as a visible endmark, the shifts are "#abab", "ab#ab",
|
85
|
+
"abab#", "b#aba", "bab#a".
|
86
|
+
.PP
|
87
|
+
The three array-building functions run in time
|
88
|
+
.IR O ( n " log " n ).
|
89
|
+
.I Sarray
|
90
|
+
and
|
91
|
+
.I bsarray
|
92
|
+
use a hybrid algorithm, typically several times
|
93
|
+
as fast as the deliberately simple
|
94
|
+
.I ssarray .
|
95
|
+
All three require temporary space equal in size to
|
96
|
+
.I a.
|
97
|
+
Space overhead may be reduced by using
|
98
|
+
.IR qsort (3)
|
99
|
+
with a suitable comparison function,
|
100
|
+
but running time then becomes at best
|
101
|
+
.IR O ( nm " log " n ),
where
|
102
|
+
.I m
|
103
|
+
is the length of the longest repeated substring.
|
104
|
+
.SH EXAMPLES
|
105
|
+
.HP
|
106
|
+
Build, in
|
107
|
+
.I a
|
108
|
+
and
|
109
|
+
.I l
|
110
|
+
respectively,
|
111
|
+
a suffix array for string
|
112
|
+
.I s
|
113
|
+
and the associated lcp array.
|
114
|
+
.
|
115
|
+
.br
|
116
|
+
.nf
|
117
|
+
.ft CW
|
118
|
+
int *l;
|
119
|
+
int n = strlen(s)+1;
|
120
|
+
int *a = scode(s);
|
121
|
+
sarray(a, n);
|
122
|
+
l = lcp(a, s, n);
|
123
|
+
.fi
|
124
|
+
.HP
|
125
|
+
Build the same suffix array, using
|
126
|
+
.I bsarray.
|
127
|
+
.br
|
128
|
+
.nf
|
129
|
+
.ft CW
|
130
|
+
int n = strlen(s);
|
131
|
+
int *a = malloc((n+1)*sizeof(int));
|
132
|
+
bsarray((unsigned char*)s, a, n);
|
133
|
+
.SH "RETURN VALUE"
|
134
|
+
.I Sarray,
|
135
|
+
.I ssarray,
|
136
|
+
and
|
137
|
+
.I bsarray
|
138
|
+
return \-1 for bad data or insufficient space.
|
139
|
+
.PP
|
140
|
+
.I Lcp
|
141
|
+
and
|
142
|
+
.I scode
|
143
|
+
return
|
144
|
+
.IR malloc 'ed
|
145
|
+
arrays, or 0 for bad data or insufficient space.
|