dalla-data-processing 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dalla/__init__.py +27 -0
- dalla/cli.py +453 -0
- dalla/core/__init__.py +6 -0
- dalla/core/dataset.py +387 -0
- dalla/core/parallel.py +279 -0
- dalla/deduplication/__init__.py +370 -0
- dalla/deduplication/bin/.gitignore +1 -0
- dalla/deduplication/bin/onion-linux-x86_64 +0 -0
- dalla/deduplication/onion/COPYING +24 -0
- dalla/deduplication/onion/Makefile +21 -0
- dalla/deduplication/onion/Makefile.config +3 -0
- dalla/deduplication/onion/README.md +21 -0
- dalla/deduplication/onion/src/Makefile +22 -0
- dalla/deduplication/onion/src/Makefile.g +23 -0
- dalla/deduplication/onion/src/buzhash.c +325 -0
- dalla/deduplication/onion/src/buzhash.h +30 -0
- dalla/deduplication/onion/src/hashdup.c +172 -0
- dalla/deduplication/onion/src/hashgen.c +206 -0
- dalla/deduplication/onion/src/onion +0 -0
- dalla/deduplication/onion/src/onion.c +799 -0
- dalla/deduplication/onion/src/onion_dup.c +824 -0
- dalla/deduplication/onion/src/version.c +17 -0
- dalla/deduplication/onion/src/version.h +10 -0
- dalla/deduplication/onion/src_sc/Makefile +22 -0
- dalla/deduplication/onion/src_sc/Makefile.g +23 -0
- dalla/deduplication/onion/src_sc/buzhash.c +325 -0
- dalla/deduplication/onion/src_sc/buzhash.h +30 -0
- dalla/deduplication/onion/src_sc/hashdup +0 -0
- dalla/deduplication/onion/src_sc/hashdup.c +172 -0
- dalla/deduplication/onion/src_sc/hashgen +0 -0
- dalla/deduplication/onion/src_sc/hashgen.c +206 -0
- dalla/deduplication/onion/src_sc/onion.c +854 -0
- dalla/deduplication/onion/src_sc/onion_dup.c +824 -0
- dalla/deduplication/onion/src_sc/version.c +17 -0
- dalla/deduplication/onion/src_sc/version.h +10 -0
- dalla/deduplication/onion_wrapper.py +223 -0
- dalla/deduplication/postprocessing.py +216 -0
- dalla/deduplication/preprocessing.py +120 -0
- dalla/quality/__init__.py +5 -0
- dalla/quality/checker.py +354 -0
- dalla/readability/__init__.py +197 -0
- dalla/readability/ranking.py +165 -0
- dalla/readability/scorer.py +148 -0
- dalla/stemming/__init__.py +551 -0
- dalla/stemming/data/words_al.txt +3414 -0
- dalla/stemming/data/words_al_t.txt +885 -0
- dalla/stemming/data/words_t.txt +7 -0
- dalla/utils/__init__.py +10 -0
- dalla/utils/logger.py +128 -0
- dalla/utils/tokenize.py +89 -0
- dalla_data_processing-0.0.1.dist-info/METADATA +393 -0
- dalla_data_processing-0.0.1.dist-info/RECORD +55 -0
- dalla_data_processing-0.0.1.dist-info/WHEEL +5 -0
- dalla_data_processing-0.0.1.dist-info/entry_points.txt +2 -0
- dalla_data_processing-0.0.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/*********************************************************************
|
|
2
|
+
* Copyright (c) 2011-2016 Jan Pomikalek, Milos Jakubicek *
|
|
3
|
+
* All rights reserved. *
|
|
4
|
+
* *
|
|
5
|
+
* This software is licensed as described in the file COPYING, which *
|
|
6
|
+
* you should have received as part of this distribution. *
|
|
7
|
+
*********************************************************************/
|
|
8
|
+
|
|
9
|
+
#include <stdio.h>
|
|
10
|
+
#include <stdlib.h>
|
|
11
|
+
#include <string.h>
|
|
12
|
+
#include "version.h"
|
|
13
|
+
|
|
14
|
+
void print_version(const char* progname) {
|
|
15
|
+
printf("%s: onion v%s\n\n", progname, VERSION);
|
|
16
|
+
printf("Copyright (c) 2011-2020 Lexical Computing Limited and Lexical Computing CZ s.r.o.\n");
|
|
17
|
+
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/*********************************************************************
|
|
2
|
+
* Copyright (c) 2011-2016 Jan Pomikalek, Milos Jakubicek *
|
|
3
|
+
* All rights reserved. *
|
|
4
|
+
* *
|
|
5
|
+
* This software is licensed as described in the file COPYING, which *
|
|
6
|
+
* you should have received as part of this distribution. *
|
|
7
|
+
*********************************************************************/
|
|
8
|
+
|
|
9
|
+
#define VERSION "1.4"
|
|
10
|
+
void print_version(const char* progname);
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Optimised build of the onion tools (hashgen, hashdup, onion).
# ../Makefile.config is presumably where INSTALL_BIN and LIBS come from;
# neither is defined in this file.
include ../Makefile.config

# The sources are .c files but are compiled with the C++ driver.
CC=g++
CFLAGS=-Wall -O3

# Objects shared by every tool.
OBJS=version.o buzhash.o
TARGETS=hashgen hashdup onion

# These targets name actions, not files: without this declaration a file
# called "all", "install" or "clean" in this directory would make the
# corresponding target appear up to date and never run.
.PHONY: all install clean

all: $(TARGETS)

# Adds the shared objects as prerequisites of every tool; the recipe comes
# from the "%: %.c" pattern rule below, where $^ then expands to the tool's
# own source plus these objects.
$(TARGETS): $(OBJS)

%.o: %.c %.h
	$(CC) $(CFLAGS) -c $<

%: %.c
	$(CC) $(CFLAGS) $^ $(LIBS) -o $@

install: $(TARGETS)
	mkdir -p $(INSTALL_BIN) && install -m 755 $(TARGETS) $(INSTALL_BIN)

clean:
	rm -f $(OBJS) $(TARGETS)
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# Debug build of the onion tools (hashgen, hashdup, onion): same rules as the
# optimised Makefile, but compiled with -g instead of -O3.
# ../Makefile.config is presumably where INSTALL_BIN and LIBS come from;
# neither is defined in this file.
include ../Makefile.config

# The sources are .c files but are compiled with the C++ driver.
CC=g++
#CFLAGS=-Wall -O3
CFLAGS=-Wall -g

# Objects shared by every tool.
OBJS=version.o buzhash.o
TARGETS=hashgen hashdup onion

# These targets name actions, not files: without this declaration a file
# called "all", "install" or "clean" in this directory would make the
# corresponding target appear up to date and never run.
.PHONY: all install clean

all: $(TARGETS)

# Adds the shared objects as prerequisites of every tool; the recipe comes
# from the "%: %.c" pattern rule below, where $^ then expands to the tool's
# own source plus these objects.
$(TARGETS): $(OBJS)

%.o: %.c %.h
	$(CC) $(CFLAGS) -c $<

%: %.c
	$(CC) $(CFLAGS) $^ $(LIBS) -o $@

install: $(TARGETS)
	mkdir -p $(INSTALL_BIN) && install -m 755 $(TARGETS) $(INSTALL_BIN)

clean:
	rm -f $(OBJS) $(TARGETS)
|
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
/*********************************************************************
|
|
2
|
+
* Copyright (c) 2011-2015 Jan Pomikalek *
|
|
3
|
+
* All rights reserved. *
|
|
4
|
+
* *
|
|
5
|
+
* This software is licensed as described in the file COPYING, which *
|
|
6
|
+
* you should have received as part of this distribution. *
|
|
7
|
+
*********************************************************************/
|
|
8
|
+
|
|
9
|
+
#include "buzhash.h"
|
|
10
|
+
#include <string.h>
|
|
11
|
+
#include <stdlib.h>
|
|
12
|
+
|
|
13
|
+
const hash_t CHAR2LONG[255] = {
|
|
14
|
+
12658332951230890439ul,
|
|
15
|
+
16607337219466274820ul,
|
|
16
|
+
4897781435750669512ul,
|
|
17
|
+
1863954398247708433ul,
|
|
18
|
+
6041299601906237138ul,
|
|
19
|
+
3602934247356726349ul,
|
|
20
|
+
13927570682514441143ul,
|
|
21
|
+
11920701378039577834ul,
|
|
22
|
+
14629533900929623503ul,
|
|
23
|
+
16546862913458629335ul,
|
|
24
|
+
10685855460932754325ul,
|
|
25
|
+
15186106020611570871ul,
|
|
26
|
+
8131473594228677807ul,
|
|
27
|
+
9287569521752445451ul,
|
|
28
|
+
5624316205208212365ul,
|
|
29
|
+
10693223548395698341ul,
|
|
30
|
+
10578473704599778022ul,
|
|
31
|
+
16693921798782755893ul,
|
|
32
|
+
15124492184888274523ul,
|
|
33
|
+
1235529881146962610ul,
|
|
34
|
+
14843219789508576687ul,
|
|
35
|
+
15526012670070475388ul,
|
|
36
|
+
6463116610490435782ul,
|
|
37
|
+
15104307767477900194ul,
|
|
38
|
+
8484741665705462025ul,
|
|
39
|
+
8100868536101218192ul,
|
|
40
|
+
3395269876321120613ul,
|
|
41
|
+
8589680476807032865ul,
|
|
42
|
+
7621819336684948355ul,
|
|
43
|
+
14153065448097834589ul,
|
|
44
|
+
6732762317790231782ul,
|
|
45
|
+
13018363043978374122ul,
|
|
46
|
+
6215216690161075437ul,
|
|
47
|
+
9357943660640904950ul,
|
|
48
|
+
12116224851753945911ul,
|
|
49
|
+
13636661669728066501ul,
|
|
50
|
+
7484247892091601413ul,
|
|
51
|
+
8512193125891820287ul,
|
|
52
|
+
10461835854496665155ul,
|
|
53
|
+
16797036920317134766ul,
|
|
54
|
+
1313270971513831546ul,
|
|
55
|
+
742840173802188917ul,
|
|
56
|
+
1249430170856643161ul,
|
|
57
|
+
17179028999057074571ul,
|
|
58
|
+
6378210156955744140ul,
|
|
59
|
+
793680677819467304ul,
|
|
60
|
+
4263415984887072454ul,
|
|
61
|
+
7875662396850393478ul,
|
|
62
|
+
17050561532048146107ul,
|
|
63
|
+
1435098142595853720ul,
|
|
64
|
+
8580942225888237636ul,
|
|
65
|
+
13308656650323976644ul,
|
|
66
|
+
16630067181906003651ul,
|
|
67
|
+
12116795942522001627ul,
|
|
68
|
+
9892291673171748547ul,
|
|
69
|
+
11660673438127243284ul,
|
|
70
|
+
6026050291617469826ul,
|
|
71
|
+
10478522635079777192ul,
|
|
72
|
+
12138158317934008218ul,
|
|
73
|
+
3518644136578100667ul,
|
|
74
|
+
4950215611630576830ul,
|
|
75
|
+
15769242181285477405ul,
|
|
76
|
+
7950690203065752077ul,
|
|
77
|
+
319974224259159447ul,
|
|
78
|
+
9604177767109474443ul,
|
|
79
|
+
2499971183666009670ul,
|
|
80
|
+
3389512945436469180ul,
|
|
81
|
+
13643083464485449791ul,
|
|
82
|
+
7197237438818751483ul,
|
|
83
|
+
11151212581995191915ul,
|
|
84
|
+
17495196072216154799ul,
|
|
85
|
+
6770497232845758508ul,
|
|
86
|
+
10987981514044724191ul,
|
|
87
|
+
14707120191905416074ul,
|
|
88
|
+
1769092362238593010ul,
|
|
89
|
+
9329650998411009452ul,
|
|
90
|
+
14719126903328637772ul,
|
|
91
|
+
16952770464905740286ul,
|
|
92
|
+
9674713352706546441ul,
|
|
93
|
+
6649376341374010415ul,
|
|
94
|
+
13209384319143003802ul,
|
|
95
|
+
15927169943220646170ul,
|
|
96
|
+
16897589646525214220ul,
|
|
97
|
+
3262252579774962994ul,
|
|
98
|
+
12644188031911778084ul,
|
|
99
|
+
12242729612781990566ul,
|
|
100
|
+
10411593575032306840ul,
|
|
101
|
+
6901591497302664456ul,
|
|
102
|
+
16282753866514979972ul,
|
|
103
|
+
1656537748780076590ul,
|
|
104
|
+
16482447327676653424ul,
|
|
105
|
+
15257560081058078415ul,
|
|
106
|
+
2959473391892618753ul,
|
|
107
|
+
6837204821782891114ul,
|
|
108
|
+
10938562237399133186ul,
|
|
109
|
+
16857781777840528196ul,
|
|
110
|
+
8483325299592247627ul,
|
|
111
|
+
8376541859638180551ul,
|
|
112
|
+
2504977066327782390ul,
|
|
113
|
+
12231409223811250404ul,
|
|
114
|
+
4744310199064570243ul,
|
|
115
|
+
17936677873798959622ul,
|
|
116
|
+
7126990633455442871ul,
|
|
117
|
+
2079219814712678870ul,
|
|
118
|
+
5067179041865164597ul,
|
|
119
|
+
2311488369720591961ul,
|
|
120
|
+
1725854410047761352ul,
|
|
121
|
+
7355938747639265690ul,
|
|
122
|
+
15490596914355917847ul,
|
|
123
|
+
2283460595124192686ul,
|
|
124
|
+
6878856348493276219ul,
|
|
125
|
+
9152647736939983958ul,
|
|
126
|
+
1662432522495537695ul,
|
|
127
|
+
11306127178924536002ul,
|
|
128
|
+
9272318044070549747ul,
|
|
129
|
+
7145744474881723964ul,
|
|
130
|
+
13448381548771200536ul,
|
|
131
|
+
16160887140379377718ul,
|
|
132
|
+
16369357319459660843ul,
|
|
133
|
+
5476117262347077406ul,
|
|
134
|
+
16602075379238506563ul,
|
|
135
|
+
11456607228896734049ul,
|
|
136
|
+
6465411526782391145ul,
|
|
137
|
+
8155612729101736593ul,
|
|
138
|
+
1740403063688953650ul,
|
|
139
|
+
4466509242016709213ul,
|
|
140
|
+
18112502299939680520ul,
|
|
141
|
+
16974090059556845575ul,
|
|
142
|
+
12326512096507303015ul,
|
|
143
|
+
15376655537080530798ul,
|
|
144
|
+
12498441914565269305ul,
|
|
145
|
+
6036826437421754258ul,
|
|
146
|
+
7912527257991934972ul,
|
|
147
|
+
16620739722007677741ul,
|
|
148
|
+
8733477150731820655ul,
|
|
149
|
+
16564684276929490022ul,
|
|
150
|
+
9409261669616170022ul,
|
|
151
|
+
8387885649776441101ul,
|
|
152
|
+
4427301691848253832ul,
|
|
153
|
+
17640389513959398145ul,
|
|
154
|
+
11987577927023442578ul,
|
|
155
|
+
1358867256273478740ul,
|
|
156
|
+
14172638869615591470ul,
|
|
157
|
+
4669134809929205329ul,
|
|
158
|
+
9146890779639199412ul,
|
|
159
|
+
2448139160410716046ul,
|
|
160
|
+
14539456923687813097ul,
|
|
161
|
+
15701779011641704372ul,
|
|
162
|
+
12184110908386419117ul,
|
|
163
|
+
6182072944631238310ul,
|
|
164
|
+
6068503614243670324ul,
|
|
165
|
+
17486237705261861510ul,
|
|
166
|
+
8141926135459860042ul,
|
|
167
|
+
11247558917664640122ul,
|
|
168
|
+
15966973352605162329ul,
|
|
169
|
+
9274584296089522436ul,
|
|
170
|
+
16106837601580129961ul,
|
|
171
|
+
5565067011055473713ul,
|
|
172
|
+
9018591362895332601ul,
|
|
173
|
+
17429669259725580644ul,
|
|
174
|
+
5862130260298638241ul,
|
|
175
|
+
10804107644379464482ul,
|
|
176
|
+
14590678293851680311ul,
|
|
177
|
+
7586397638435564357ul,
|
|
178
|
+
5024282990565981028ul,
|
|
179
|
+
17710866669113912150ul,
|
|
180
|
+
10607302159042519593ul,
|
|
181
|
+
10224690187282473862ul,
|
|
182
|
+
12691341730791771243ul,
|
|
183
|
+
446919220230245087ul,
|
|
184
|
+
11928822690215012312ul,
|
|
185
|
+
14695552131553031715ul,
|
|
186
|
+
9373710656266261295ul,
|
|
187
|
+
10535666776941439244ul,
|
|
188
|
+
4764286487123496201ul,
|
|
189
|
+
12081558227095427560ul,
|
|
190
|
+
14657526787837780677ul,
|
|
191
|
+
4854775944749701021ul,
|
|
192
|
+
18014893051074447624ul,
|
|
193
|
+
5961551484053396826ul,
|
|
194
|
+
7007393494224833114ul,
|
|
195
|
+
1918625258470397717ul,
|
|
196
|
+
2249596653018019968ul,
|
|
197
|
+
15376752853428300944ul,
|
|
198
|
+
15661589396388907215ul,
|
|
199
|
+
17959491169395034186ul,
|
|
200
|
+
7412669116831624121ul,
|
|
201
|
+
16613322186307011607ul,
|
|
202
|
+
1168394068192978862ul,
|
|
203
|
+
13541384245715877822ul,
|
|
204
|
+
17842264847294623193ul,
|
|
205
|
+
8656129051250713732ul,
|
|
206
|
+
6600363660893585591ul,
|
|
207
|
+
10437456264051898071ul,
|
|
208
|
+
6483876479559582910ul,
|
|
209
|
+
2351460095187333222ul,
|
|
210
|
+
17709647483310915437ul,
|
|
211
|
+
4687819186773626811ul,
|
|
212
|
+
12859142186029646747ul,
|
|
213
|
+
14196439022719216916ul,
|
|
214
|
+
10831194418958921226ul,
|
|
215
|
+
9958754500157295475ul,
|
|
216
|
+
2812703802823563549ul,
|
|
217
|
+
364639487745161427ul,
|
|
218
|
+
18071223067394944401ul,
|
|
219
|
+
11148005916176784196ul,
|
|
220
|
+
10887057658503987840ul,
|
|
221
|
+
7239832157577921295ul,
|
|
222
|
+
6274798767279704963ul,
|
|
223
|
+
9654930315473449062ul,
|
|
224
|
+
11342083202968693359ul,
|
|
225
|
+
8060885109403789727ul,
|
|
226
|
+
532804797012507628ul,
|
|
227
|
+
4259820420986796757ul,
|
|
228
|
+
3591121934050292837ul,
|
|
229
|
+
3739649723128072566ul,
|
|
230
|
+
11338759925899470208ul,
|
|
231
|
+
17557031182161531657ul,
|
|
232
|
+
1328316363081986551ul,
|
|
233
|
+
905104119772647733ul,
|
|
234
|
+
16162805969666123858ul,
|
|
235
|
+
13351191969939227039ul,
|
|
236
|
+
11181921000405417530ul,
|
|
237
|
+
1257129276560696939ul,
|
|
238
|
+
8049492553042309720ul,
|
|
239
|
+
8867122601488545729ul,
|
|
240
|
+
8169023185794623188ul,
|
|
241
|
+
14027174324336484013ul,
|
|
242
|
+
3026556086188399794ul,
|
|
243
|
+
7137339202398299406ul,
|
|
244
|
+
15636400854018083176ul,
|
|
245
|
+
1912758983363371197ul,
|
|
246
|
+
12934014134659659938ul,
|
|
247
|
+
6432162334519755563ul,
|
|
248
|
+
11890239098696368321ul,
|
|
249
|
+
465021739668949123ul,
|
|
250
|
+
3571688800220472097ul,
|
|
251
|
+
17356096479830501074ul,
|
|
252
|
+
17244551474859817129ul,
|
|
253
|
+
16858016682994011520ul,
|
|
254
|
+
11599911656842386375ul,
|
|
255
|
+
1384801604554958238ul,
|
|
256
|
+
10350053496655489375ul,
|
|
257
|
+
2028044935420165668ul,
|
|
258
|
+
9321839731809955516ul,
|
|
259
|
+
3800717409646038380ul,
|
|
260
|
+
508616612214119935ul,
|
|
261
|
+
14489270436014461891ul,
|
|
262
|
+
11373150082561320490ul,
|
|
263
|
+
8855221204049336307ul,
|
|
264
|
+
11920562817555372746ul,
|
|
265
|
+
17464569634060446109ul,
|
|
266
|
+
146583913832133545ul,
|
|
267
|
+
11454565731520647642ul,
|
|
268
|
+
14516679283835536061ul,
|
|
269
|
+
};
|
|
270
|
+
|
|
271
|
+
// 64-bit left circular shift.
//
// value: the word to rotate.
// shift: number of bit positions; it is reduced modulo 64, so 0 and 64 both
//        return `value` unchanged.
//
// The reduction matters: shifting a 64-bit value by 64 or more is undefined
// behaviour in C, and the naive form `value >> (64 - shift)` does exactly
// that for shift == 0. buzhash() passes the window size as the shift, so
// sizes of 64 and above reach this path.
hash_t rotate_left(hash_t value, int shift) {
    const unsigned int left = (unsigned int) shift & 63u;
    // "& 63u" turns the complementary shift for left == 0 into 0, not 64.
    const unsigned int right = (64u - left) & 63u;
    return (value << left) | (value >> right);
}
|
|
275
|
+
|
|
276
|
+
hash_t hash_string(char* string) {
|
|
277
|
+
hash_t hash = 0;
|
|
278
|
+
int string_len = strlen(string);
|
|
279
|
+
int i;
|
|
280
|
+
for (i=0; i<string_len; i++) {
|
|
281
|
+
hash ^= CHAR2LONG[(unsigned char) string[i]];
|
|
282
|
+
hash = rotate_left(hash, 1);
|
|
283
|
+
}
|
|
284
|
+
return hash;
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
// Initialise a rolling-hash buffer with room for `size` element hashes.
//
// buffer: structure to initialise; any previous contents are overwritten
//         (an earlier `elems` allocation is not freed here).
// size:   number of slots in the circular window.
//
// On return the buffer is empty: elem_count and hash are 0, and last_index
// is size - 1 so that the first insertion lands in slot 0.
//
// This function has no way to report failure, so running out of memory
// terminates the program with abort() rather than leaving a NULL `elems`
// that buzhash() would later write through. For size <= 0 no memory is
// requested and `elems` is NULL.
void buzhash_init_buffer(buzhash_buffer_t* buffer, int size) {
    buffer->size = size;
    buffer->elem_count = 0;
    buffer->last_index = size - 1;
    buffer->hash = 0;
    buffer->elems = NULL;
    if (size > 0) {
        buffer->elems = (hash_t*) malloc((size_t) size * sizeof(hash_t));
        if (buffer->elems == NULL)
            abort();
    }
}
|
|
294
|
+
|
|
295
|
+
// Reset a buffer to the empty state without touching its slot array.
//
// The hash and element count go back to 0, and last_index goes back to
// size - 1, i.e. the same values buzhash_init_buffer() sets. `size` and
// `elems` are left as they are.
void buzhash_clear_buffer(buzhash_buffer_t* buffer) {
    buffer->hash = 0;
    buffer->last_index = buffer->size - 1;
    buffer->elem_count = 0;
}
|
|
300
|
+
|
|
301
|
+
// Release the slot array owned by `buffer`.
//
// The structure itself is not freed. `elems` is set to NULL afterwards so
// that calling this function again on the same buffer is harmless
// (free(NULL) is a no-op) instead of a double free.
void buzhash_free_buffer(buzhash_buffer_t* buffer) {
    free(buffer->elems);
    buffer->elems = NULL;
}
|
|
304
|
+
|
|
305
|
+
// Report whether the window holds as many elements as it has slots.
//
// Returns 1 when elem_count equals size, 0 otherwise.
int buzhash_is_full_buffer(buzhash_buffer_t* buffer) {
    const int capacity = buffer->size;
    return capacity == buffer->elem_count;
}
|
|
308
|
+
|
|
309
|
+
// Push `string` into the rolling window and return the updated window hash.
//
// string: NUL-terminated element to add; hashed with hash_string().
// buffer: window state; its hash, last_index, elems and (until the window
//         is full) elem_count are updated.
//
// The new hash is the old hash rotated left by one bit, XORed with the
// hash of the incoming element. Once the window is full, the slot about to
// be overwritten holds the oldest element; its hash, rotated left by
// `size` bits, is XORed in as well to cancel its contribution.
hash_t buzhash(char* string, buzhash_buffer_t* buffer) {
    const hash_t incoming = hash_string(string);
    // The slot after the most recent one is where the new element goes,
    // and, when the window is full, where the oldest element lives.
    const int slot = (buffer->last_index + 1) % buffer->size;
    hash_t rolled = rotate_left(buffer->hash, 1) ^ incoming;

    if (buffer->elem_count >= buffer->size) {
        rolled ^= rotate_left(buffer->elems[slot], buffer->size);
    }
    else {
        buffer->elem_count++;
    }

    buffer->elems[slot] = incoming;
    buffer->last_index = slot;
    buffer->hash = rolled;
    return rolled;
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/*********************************************************************
|
|
2
|
+
* Copyright (c) 2011-2015 Jan Pomikalek *
|
|
3
|
+
* All rights reserved. *
|
|
4
|
+
* *
|
|
5
|
+
* This software is licensed as described in the file COPYING, which *
|
|
6
|
+
* you should have received as part of this distribution. *
|
|
7
|
+
*********************************************************************/
|
|
8
|
+
|
|
9
|
+
#ifndef BUZHASH_H
#define BUZHASH_H

#include <stdint.h>

// Largest value a hash can take (2^64 - 1).
#define BUZHASH_MAX 18446744073709551615ul

// 64-bit hash value.
typedef uint64_t hash_t;

// State of a rolling hash over a fixed-size window of element hashes.
typedef struct {
    int size;        // number of slots in elems
    hash_t *elems;   // slot array, allocated by buzhash_init_buffer()
    int elem_count;  // current number of elements in the buffer
    int last_index;  // the index of the last element (buffer is circular)
    hash_t hash;     // current hash value
} buzhash_buffer_t;

// Hash a single NUL-terminated string.
hash_t hash_string(char* string);

// Prepare `buffer` as an empty window with `size` slots.
void buzhash_init_buffer(buzhash_buffer_t* buffer, int size);

// Empty the window; the slot array is kept.
void buzhash_clear_buffer(buzhash_buffer_t* buffer);

// Release the slot array of `buffer`.
void buzhash_free_buffer(buzhash_buffer_t* buffer);

// Non-zero when the window holds `size` elements.
int buzhash_is_full_buffer(buzhash_buffer_t* buffer);

// Add `string` to the window and return the updated window hash.
hash_t buzhash(char* string, buzhash_buffer_t* buffer);

#endif
|
|
Binary file
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
/*********************************************************************
|
|
2
|
+
* Copyright (c) 2011-2015 Jan Pomikalek *
|
|
3
|
+
* All rights reserved. *
|
|
4
|
+
* *
|
|
5
|
+
* This software is licensed as described in the file COPYING, which *
|
|
6
|
+
* you should have received as part of this distribution. *
|
|
7
|
+
*********************************************************************/
|
|
8
|
+
|
|
9
|
+
#include <errno.h>
|
|
10
|
+
#include <fcntl.h>
|
|
11
|
+
#include <stdio.h>
|
|
12
|
+
#include <stdlib.h>
|
|
13
|
+
#include <string.h>
|
|
14
|
+
#include <time.h>
|
|
15
|
+
#include <unistd.h>
|
|
16
|
+
#include <sys/mman.h>
|
|
17
|
+
#include <sys/time.h>
|
|
18
|
+
#include <sys/stat.h>
|
|
19
|
+
#include <sys/types.h>
|
|
20
|
+
#include "buzhash.h"
|
|
21
|
+
#include "version.h"
|
|
22
|
+
|
|
23
|
+
// Name of the output file used when -o is not given.
#define OUTPUT_FILE "duphashes"

// options
char *Output_file = OUTPUT_FILE;  // output path; replaced by the -o argument
int Quiet = 0;                    // set to 1 by -q to silence progress messages
FILE* Input;
long int Input_size;

// Write the command-line help text to `stream`.
//
// The default output file name shown in the text is OUTPUT_FILE.
void print_usage(FILE *stream) {
    static const char usage_format[] =
        "Usage: hashdup [OPTIONS] FILE [FILE...]\n"
        "Identify duplicate hashes.\n"
        "\n"
        "-o FILE output file (default: %s)\n"
        "-q quiet; suppress all output except for errors\n"
        "\n"
        "-V print version information and exit\n"
        "-h display this help and exit\n"
        "\n"
        "Project home page: <http://code.google.com/p/onion/>\n";

    fprintf(stream, usage_format, OUTPUT_FILE);
}
|
|
45
|
+
|
|
46
|
+
// Sort array[left_begin .. right_begin] (both bounds inclusive) in place,
// in ascending order.
//
// Based on the partitioning scheme from http://cs.wikipedia.org/wiki/Quicksort
// with two safety changes:
//   - an empty or single-element range (right_begin <= left_begin) returns
//     immediately instead of reading outside the range; callers pass
//     right_begin == -1 for an empty array;
//   - only the smaller partition is handled by recursion and the larger one
//     by the loop, which keeps the stack depth logarithmic in the range
//     length even when the pivots are bad.
void quicksort(hash_t array[], long int left_begin, long int right_begin) {
    while (left_begin < right_begin) {
        // Middle element as pivot; written this way so the index sum
        // cannot overflow.
        const hash_t pivot = array[left_begin + (right_begin - left_begin) / 2];
        long int lo = left_begin;
        long int hi = right_begin;

        do {
            while (array[lo] < pivot)
                lo++;
            while (array[hi] > pivot)
                hi--;
            if (lo <= hi) {
                const hash_t tmp = array[lo];
                array[lo] = array[hi];
                array[hi] = tmp;
                lo++;
                hi--;
            }
        } while (lo <= hi);

        // Here hi < lo, everything up to hi is <= pivot and everything
        // from lo on is >= pivot.
        if (hi - left_begin < right_begin - lo) {
            quicksort(array, left_begin, hi);
            left_begin = lo;
        }
        else {
            quicksort(array, lo, right_begin);
            right_begin = hi;
        }
    }
}
|
|
70
|
+
|
|
71
|
+
// Write a timestamped progress line to stderr.
//
// processed_files: number of input files handled so far.
// total_files:     number of input files given on the command line.
void print_progress(int processed_files, int total_files) {
    const time_t now = time(NULL);
    // "%.24s" prints the first 24 characters of the ctime() string, which
    // leaves out its trailing newline.
    const char *stamp = ctime(&now);
    fprintf(stderr, "[%.24s] hashdup: %i / %i files processed\n", stamp,
            processed_files, total_files);
}
|
|
77
|
+
|
|
78
|
+
int main(int argc, char **argv) {
|
|
79
|
+
// get options
|
|
80
|
+
int c;
|
|
81
|
+
while ((c = getopt(argc, argv, "o:qVh")) != -1) {
|
|
82
|
+
errno = 0;
|
|
83
|
+
switch (c) {
|
|
84
|
+
case 'o':
|
|
85
|
+
Output_file = optarg;
|
|
86
|
+
break;
|
|
87
|
+
case 'q':
|
|
88
|
+
Quiet = 1;
|
|
89
|
+
break;
|
|
90
|
+
case 'V':
|
|
91
|
+
print_version("hashdup");
|
|
92
|
+
return 0;
|
|
93
|
+
case 'h':
|
|
94
|
+
print_usage(stdout);
|
|
95
|
+
return 0;
|
|
96
|
+
case '?':
|
|
97
|
+
print_usage(stderr);
|
|
98
|
+
return 1;
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
if (optind >= argc) {
|
|
103
|
+
fprintf(stderr, "No input.\n");
|
|
104
|
+
print_usage(stderr);
|
|
105
|
+
return 1;
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
// output file
|
|
109
|
+
errno = 0;
|
|
110
|
+
FILE* output_fp = fopen(Output_file, "w");
|
|
111
|
+
if (errno != 0) {
|
|
112
|
+
fprintf(stderr, "Unable to open %s for writing.\n", Output_file);
|
|
113
|
+
return 1;
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
int input_files_count = argc - optind;
|
|
117
|
+
|
|
118
|
+
// for all input files
|
|
119
|
+
int i;
|
|
120
|
+
for (i=optind; i<argc; i++) {
|
|
121
|
+
// open file
|
|
122
|
+
char* filename = argv[i];
|
|
123
|
+
int input_fd = open(filename, O_RDONLY);
|
|
124
|
+
if (input_fd == -1) {
|
|
125
|
+
fprintf(stderr, "Unable to open %s for reading.\n", filename);
|
|
126
|
+
return 1;
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
// determine file size
|
|
130
|
+
unsigned long int file_size = lseek(input_fd, 0L, SEEK_END);
|
|
131
|
+
lseek(input_fd, 0L, SEEK_SET);
|
|
132
|
+
|
|
133
|
+
// map hashes into memory
|
|
134
|
+
hash_t* hashes = NULL;
|
|
135
|
+
hashes = (hash_t*) mmap(hashes, file_size, PROT_READ | PROT_WRITE,
|
|
136
|
+
MAP_PRIVATE, input_fd, 0);
|
|
137
|
+
|
|
138
|
+
// sort hashes
|
|
139
|
+
unsigned long int hash_count = file_size / sizeof(hash_t);
|
|
140
|
+
quicksort(hashes, 0, hash_count-1);
|
|
141
|
+
|
|
142
|
+
// send duplicate hashes to the output
|
|
143
|
+
int written = 0;
|
|
144
|
+
hash_t prev_hash = hashes[0];
|
|
145
|
+
hash_t hash;
|
|
146
|
+
unsigned long int j;
|
|
147
|
+
for (j=1; j<hash_count; j++) {
|
|
148
|
+
hash = hashes[j];
|
|
149
|
+
if (hash == prev_hash) {
|
|
150
|
+
if (!written) {
|
|
151
|
+
fwrite(&hash, sizeof(hash), 1, output_fp);
|
|
152
|
+
written = 1;
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
else {
|
|
156
|
+
written = 0;
|
|
157
|
+
}
|
|
158
|
+
prev_hash = hash;
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
munmap(hashes, file_size);
|
|
162
|
+
close(input_fd);
|
|
163
|
+
|
|
164
|
+
// print progress information
|
|
165
|
+
if (!Quiet)
|
|
166
|
+
print_progress(i - optind + 1, input_files_count);
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
fclose(output_fp);
|
|
170
|
+
|
|
171
|
+
return 0;
|
|
172
|
+
}
|
|
Binary file
|