dalla-data-processing 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. dalla/__init__.py +27 -0
  2. dalla/cli.py +453 -0
  3. dalla/core/__init__.py +6 -0
  4. dalla/core/dataset.py +387 -0
  5. dalla/core/parallel.py +279 -0
  6. dalla/deduplication/__init__.py +370 -0
  7. dalla/deduplication/bin/.gitignore +1 -0
  8. dalla/deduplication/bin/onion-linux-x86_64 +0 -0
  9. dalla/deduplication/onion/COPYING +24 -0
  10. dalla/deduplication/onion/Makefile +21 -0
  11. dalla/deduplication/onion/Makefile.config +3 -0
  12. dalla/deduplication/onion/README.md +21 -0
  13. dalla/deduplication/onion/src/Makefile +22 -0
  14. dalla/deduplication/onion/src/Makefile.g +23 -0
  15. dalla/deduplication/onion/src/buzhash.c +325 -0
  16. dalla/deduplication/onion/src/buzhash.h +30 -0
  17. dalla/deduplication/onion/src/hashdup.c +172 -0
  18. dalla/deduplication/onion/src/hashgen.c +206 -0
  19. dalla/deduplication/onion/src/onion +0 -0
  20. dalla/deduplication/onion/src/onion.c +799 -0
  21. dalla/deduplication/onion/src/onion_dup.c +824 -0
  22. dalla/deduplication/onion/src/version.c +17 -0
  23. dalla/deduplication/onion/src/version.h +10 -0
  24. dalla/deduplication/onion/src_sc/Makefile +22 -0
  25. dalla/deduplication/onion/src_sc/Makefile.g +23 -0
  26. dalla/deduplication/onion/src_sc/buzhash.c +325 -0
  27. dalla/deduplication/onion/src_sc/buzhash.h +30 -0
  28. dalla/deduplication/onion/src_sc/hashdup +0 -0
  29. dalla/deduplication/onion/src_sc/hashdup.c +172 -0
  30. dalla/deduplication/onion/src_sc/hashgen +0 -0
  31. dalla/deduplication/onion/src_sc/hashgen.c +206 -0
  32. dalla/deduplication/onion/src_sc/onion.c +854 -0
  33. dalla/deduplication/onion/src_sc/onion_dup.c +824 -0
  34. dalla/deduplication/onion/src_sc/version.c +17 -0
  35. dalla/deduplication/onion/src_sc/version.h +10 -0
  36. dalla/deduplication/onion_wrapper.py +223 -0
  37. dalla/deduplication/postprocessing.py +216 -0
  38. dalla/deduplication/preprocessing.py +120 -0
  39. dalla/quality/__init__.py +5 -0
  40. dalla/quality/checker.py +354 -0
  41. dalla/readability/__init__.py +197 -0
  42. dalla/readability/ranking.py +165 -0
  43. dalla/readability/scorer.py +148 -0
  44. dalla/stemming/__init__.py +551 -0
  45. dalla/stemming/data/words_al.txt +3414 -0
  46. dalla/stemming/data/words_al_t.txt +885 -0
  47. dalla/stemming/data/words_t.txt +7 -0
  48. dalla/utils/__init__.py +10 -0
  49. dalla/utils/logger.py +128 -0
  50. dalla/utils/tokenize.py +89 -0
  51. dalla_data_processing-0.0.1.dist-info/METADATA +393 -0
  52. dalla_data_processing-0.0.1.dist-info/RECORD +55 -0
  53. dalla_data_processing-0.0.1.dist-info/WHEEL +5 -0
  54. dalla_data_processing-0.0.1.dist-info/entry_points.txt +2 -0
  55. dalla_data_processing-0.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,325 @@
1
+ /*********************************************************************
2
+ * Copyright (c) 2011-2015 Jan Pomikalek *
3
+ * All rights reserved. *
4
+ * *
5
+ * This software is licensed as described in the file COPYING, which *
6
+ * you should have received as part of this distribution. *
7
+ *********************************************************************/
8
+
9
+ #include "buzhash.h"
10
+ #include <string.h>
11
+ #include <stdlib.h>
12
+
13
+ const hash_t CHAR2LONG[255] = {
14
+ 12658332951230890439ul,
15
+ 16607337219466274820ul,
16
+ 4897781435750669512ul,
17
+ 1863954398247708433ul,
18
+ 6041299601906237138ul,
19
+ 3602934247356726349ul,
20
+ 13927570682514441143ul,
21
+ 11920701378039577834ul,
22
+ 14629533900929623503ul,
23
+ 16546862913458629335ul,
24
+ 10685855460932754325ul,
25
+ 15186106020611570871ul,
26
+ 8131473594228677807ul,
27
+ 9287569521752445451ul,
28
+ 5624316205208212365ul,
29
+ 10693223548395698341ul,
30
+ 10578473704599778022ul,
31
+ 16693921798782755893ul,
32
+ 15124492184888274523ul,
33
+ 1235529881146962610ul,
34
+ 14843219789508576687ul,
35
+ 15526012670070475388ul,
36
+ 6463116610490435782ul,
37
+ 15104307767477900194ul,
38
+ 8484741665705462025ul,
39
+ 8100868536101218192ul,
40
+ 3395269876321120613ul,
41
+ 8589680476807032865ul,
42
+ 7621819336684948355ul,
43
+ 14153065448097834589ul,
44
+ 6732762317790231782ul,
45
+ 13018363043978374122ul,
46
+ 6215216690161075437ul,
47
+ 9357943660640904950ul,
48
+ 12116224851753945911ul,
49
+ 13636661669728066501ul,
50
+ 7484247892091601413ul,
51
+ 8512193125891820287ul,
52
+ 10461835854496665155ul,
53
+ 16797036920317134766ul,
54
+ 1313270971513831546ul,
55
+ 742840173802188917ul,
56
+ 1249430170856643161ul,
57
+ 17179028999057074571ul,
58
+ 6378210156955744140ul,
59
+ 793680677819467304ul,
60
+ 4263415984887072454ul,
61
+ 7875662396850393478ul,
62
+ 17050561532048146107ul,
63
+ 1435098142595853720ul,
64
+ 8580942225888237636ul,
65
+ 13308656650323976644ul,
66
+ 16630067181906003651ul,
67
+ 12116795942522001627ul,
68
+ 9892291673171748547ul,
69
+ 11660673438127243284ul,
70
+ 6026050291617469826ul,
71
+ 10478522635079777192ul,
72
+ 12138158317934008218ul,
73
+ 3518644136578100667ul,
74
+ 4950215611630576830ul,
75
+ 15769242181285477405ul,
76
+ 7950690203065752077ul,
77
+ 319974224259159447ul,
78
+ 9604177767109474443ul,
79
+ 2499971183666009670ul,
80
+ 3389512945436469180ul,
81
+ 13643083464485449791ul,
82
+ 7197237438818751483ul,
83
+ 11151212581995191915ul,
84
+ 17495196072216154799ul,
85
+ 6770497232845758508ul,
86
+ 10987981514044724191ul,
87
+ 14707120191905416074ul,
88
+ 1769092362238593010ul,
89
+ 9329650998411009452ul,
90
+ 14719126903328637772ul,
91
+ 16952770464905740286ul,
92
+ 9674713352706546441ul,
93
+ 6649376341374010415ul,
94
+ 13209384319143003802ul,
95
+ 15927169943220646170ul,
96
+ 16897589646525214220ul,
97
+ 3262252579774962994ul,
98
+ 12644188031911778084ul,
99
+ 12242729612781990566ul,
100
+ 10411593575032306840ul,
101
+ 6901591497302664456ul,
102
+ 16282753866514979972ul,
103
+ 1656537748780076590ul,
104
+ 16482447327676653424ul,
105
+ 15257560081058078415ul,
106
+ 2959473391892618753ul,
107
+ 6837204821782891114ul,
108
+ 10938562237399133186ul,
109
+ 16857781777840528196ul,
110
+ 8483325299592247627ul,
111
+ 8376541859638180551ul,
112
+ 2504977066327782390ul,
113
+ 12231409223811250404ul,
114
+ 4744310199064570243ul,
115
+ 17936677873798959622ul,
116
+ 7126990633455442871ul,
117
+ 2079219814712678870ul,
118
+ 5067179041865164597ul,
119
+ 2311488369720591961ul,
120
+ 1725854410047761352ul,
121
+ 7355938747639265690ul,
122
+ 15490596914355917847ul,
123
+ 2283460595124192686ul,
124
+ 6878856348493276219ul,
125
+ 9152647736939983958ul,
126
+ 1662432522495537695ul,
127
+ 11306127178924536002ul,
128
+ 9272318044070549747ul,
129
+ 7145744474881723964ul,
130
+ 13448381548771200536ul,
131
+ 16160887140379377718ul,
132
+ 16369357319459660843ul,
133
+ 5476117262347077406ul,
134
+ 16602075379238506563ul,
135
+ 11456607228896734049ul,
136
+ 6465411526782391145ul,
137
+ 8155612729101736593ul,
138
+ 1740403063688953650ul,
139
+ 4466509242016709213ul,
140
+ 18112502299939680520ul,
141
+ 16974090059556845575ul,
142
+ 12326512096507303015ul,
143
+ 15376655537080530798ul,
144
+ 12498441914565269305ul,
145
+ 6036826437421754258ul,
146
+ 7912527257991934972ul,
147
+ 16620739722007677741ul,
148
+ 8733477150731820655ul,
149
+ 16564684276929490022ul,
150
+ 9409261669616170022ul,
151
+ 8387885649776441101ul,
152
+ 4427301691848253832ul,
153
+ 17640389513959398145ul,
154
+ 11987577927023442578ul,
155
+ 1358867256273478740ul,
156
+ 14172638869615591470ul,
157
+ 4669134809929205329ul,
158
+ 9146890779639199412ul,
159
+ 2448139160410716046ul,
160
+ 14539456923687813097ul,
161
+ 15701779011641704372ul,
162
+ 12184110908386419117ul,
163
+ 6182072944631238310ul,
164
+ 6068503614243670324ul,
165
+ 17486237705261861510ul,
166
+ 8141926135459860042ul,
167
+ 11247558917664640122ul,
168
+ 15966973352605162329ul,
169
+ 9274584296089522436ul,
170
+ 16106837601580129961ul,
171
+ 5565067011055473713ul,
172
+ 9018591362895332601ul,
173
+ 17429669259725580644ul,
174
+ 5862130260298638241ul,
175
+ 10804107644379464482ul,
176
+ 14590678293851680311ul,
177
+ 7586397638435564357ul,
178
+ 5024282990565981028ul,
179
+ 17710866669113912150ul,
180
+ 10607302159042519593ul,
181
+ 10224690187282473862ul,
182
+ 12691341730791771243ul,
183
+ 446919220230245087ul,
184
+ 11928822690215012312ul,
185
+ 14695552131553031715ul,
186
+ 9373710656266261295ul,
187
+ 10535666776941439244ul,
188
+ 4764286487123496201ul,
189
+ 12081558227095427560ul,
190
+ 14657526787837780677ul,
191
+ 4854775944749701021ul,
192
+ 18014893051074447624ul,
193
+ 5961551484053396826ul,
194
+ 7007393494224833114ul,
195
+ 1918625258470397717ul,
196
+ 2249596653018019968ul,
197
+ 15376752853428300944ul,
198
+ 15661589396388907215ul,
199
+ 17959491169395034186ul,
200
+ 7412669116831624121ul,
201
+ 16613322186307011607ul,
202
+ 1168394068192978862ul,
203
+ 13541384245715877822ul,
204
+ 17842264847294623193ul,
205
+ 8656129051250713732ul,
206
+ 6600363660893585591ul,
207
+ 10437456264051898071ul,
208
+ 6483876479559582910ul,
209
+ 2351460095187333222ul,
210
+ 17709647483310915437ul,
211
+ 4687819186773626811ul,
212
+ 12859142186029646747ul,
213
+ 14196439022719216916ul,
214
+ 10831194418958921226ul,
215
+ 9958754500157295475ul,
216
+ 2812703802823563549ul,
217
+ 364639487745161427ul,
218
+ 18071223067394944401ul,
219
+ 11148005916176784196ul,
220
+ 10887057658503987840ul,
221
+ 7239832157577921295ul,
222
+ 6274798767279704963ul,
223
+ 9654930315473449062ul,
224
+ 11342083202968693359ul,
225
+ 8060885109403789727ul,
226
+ 532804797012507628ul,
227
+ 4259820420986796757ul,
228
+ 3591121934050292837ul,
229
+ 3739649723128072566ul,
230
+ 11338759925899470208ul,
231
+ 17557031182161531657ul,
232
+ 1328316363081986551ul,
233
+ 905104119772647733ul,
234
+ 16162805969666123858ul,
235
+ 13351191969939227039ul,
236
+ 11181921000405417530ul,
237
+ 1257129276560696939ul,
238
+ 8049492553042309720ul,
239
+ 8867122601488545729ul,
240
+ 8169023185794623188ul,
241
+ 14027174324336484013ul,
242
+ 3026556086188399794ul,
243
+ 7137339202398299406ul,
244
+ 15636400854018083176ul,
245
+ 1912758983363371197ul,
246
+ 12934014134659659938ul,
247
+ 6432162334519755563ul,
248
+ 11890239098696368321ul,
249
+ 465021739668949123ul,
250
+ 3571688800220472097ul,
251
+ 17356096479830501074ul,
252
+ 17244551474859817129ul,
253
+ 16858016682994011520ul,
254
+ 11599911656842386375ul,
255
+ 1384801604554958238ul,
256
+ 10350053496655489375ul,
257
+ 2028044935420165668ul,
258
+ 9321839731809955516ul,
259
+ 3800717409646038380ul,
260
+ 508616612214119935ul,
261
+ 14489270436014461891ul,
262
+ 11373150082561320490ul,
263
+ 8855221204049336307ul,
264
+ 11920562817555372746ul,
265
+ 17464569634060446109ul,
266
+ 146583913832133545ul,
267
+ 11454565731520647642ul,
268
+ 14516679283835536061ul,
269
+ };
270
+
271
+ // 64-bit left circular shift
272
+ hash_t rotate_left(hash_t value, int shift) {
273
+ return (value << shift) | (value >> (64 - shift));
274
+ }
275
+
276
+ hash_t hash_string(char* string) {
277
+ hash_t hash = 0;
278
+ int string_len = strlen(string);
279
+ int i;
280
+ for (i=0; i<string_len; i++) {
281
+ hash ^= CHAR2LONG[(unsigned char) string[i]];
282
+ hash = rotate_left(hash, 1);
283
+ }
284
+ return hash;
285
+ }
286
+
287
+ void buzhash_init_buffer(buzhash_buffer_t* buffer, int size) {
288
+ buffer->size = size;
289
+ buffer->elem_count = 0;
290
+ buffer->last_index = size - 1;
291
+ buffer->hash = 0;
292
+ buffer->elems = (hash_t*) malloc(size * sizeof(hash_t));
293
+ }
294
+
295
+ void buzhash_clear_buffer(buzhash_buffer_t* buffer) {
296
+ buffer->elem_count = 0;
297
+ buffer->last_index = buffer->size - 1;
298
+ buffer->hash = 0;
299
+ }
300
+
301
+ void buzhash_free_buffer(buzhash_buffer_t* buffer) {
302
+ free(buffer->elems);
303
+ }
304
+
305
+ int buzhash_is_full_buffer(buzhash_buffer_t* buffer) {
306
+ return (buffer->elem_count == buffer->size);
307
+ }
308
+
309
+ hash_t buzhash(char* string, buzhash_buffer_t* buffer) {
310
+ hash_t string_hash = hash_string(string);
311
+ if (buffer->elem_count < buffer->size) {
312
+ buffer->last_index = (buffer->last_index + 1) % buffer->size;
313
+ buffer->hash = rotate_left(buffer->hash, 1) ^ string_hash;
314
+ buffer->elems[buffer->last_index] = string_hash;
315
+ buffer->elem_count++;
316
+ }
317
+ else {
318
+ int fst_index = (buffer->last_index + 1) % buffer->size;
319
+ buffer->hash = rotate_left(buffer->hash, 1) ^ rotate_left(
320
+ buffer->elems[fst_index], buffer->size) ^ string_hash;
321
+ buffer->last_index = fst_index;
322
+ buffer->elems[buffer->last_index] = string_hash;
323
+ }
324
+ return buffer->hash;
325
+ }
@@ -0,0 +1,30 @@
1
+ /*********************************************************************
2
+ * Copyright (c) 2011-2015 Jan Pomikalek *
3
+ * All rights reserved. *
4
+ * *
5
+ * This software is licensed as described in the file COPYING, which *
6
+ * you should have received as part of this distribution. *
7
+ *********************************************************************/
8
+
9
#ifndef BUZHASH_H
#define BUZHASH_H

#include <stdint.h>

// largest value a hash_t can hold (2^64 - 1)
#define BUZHASH_MAX 18446744073709551615ul

// every hash is an unsigned 64-bit integer
typedef uint64_t hash_t;

// state of a rolling hash computed over a circular window of string hashes
typedef struct {
    int size;         // number of slots in elems
    hash_t *elems;    // the string hashes currently inside the window
    int elem_count;   // current number of elements in the buffer
    int last_index;   // the index of the last element (buffer is circular)
    hash_t hash;      // current hash value
} buzhash_buffer_t;

// hash of a single NUL-terminated string
hash_t hash_string(char* string);

// buffer life cycle: allocate, reset to empty, release
void buzhash_init_buffer(buzhash_buffer_t* buffer, int size);
void buzhash_clear_buffer(buzhash_buffer_t* buffer);
void buzhash_free_buffer(buzhash_buffer_t* buffer);

// non-zero when the buffer holds `size` elements
int buzhash_is_full_buffer(buzhash_buffer_t* buffer);

// add a string to the window and return the updated window hash
hash_t buzhash(char* string, buzhash_buffer_t* buffer);

#endif
@@ -0,0 +1,172 @@
1
+ /*********************************************************************
2
+ * Copyright (c) 2011-2015 Jan Pomikalek *
3
+ * All rights reserved. *
4
+ * *
5
+ * This software is licensed as described in the file COPYING, which *
6
+ * you should have received as part of this distribution. *
7
+ *********************************************************************/
8
+
9
+ #include <errno.h>
10
+ #include <fcntl.h>
11
+ #include <stdio.h>
12
+ #include <stdlib.h>
13
+ #include <string.h>
14
+ #include <time.h>
15
+ #include <unistd.h>
16
+ #include <sys/mman.h>
17
+ #include <sys/time.h>
18
+ #include <sys/stat.h>
19
+ #include <sys/types.h>
20
+ #include "buzhash.h"
21
+ #include "version.h"
22
+
23
// name of the output file used when -o is not given
#define OUTPUT_FILE "duphashes"

// options
int Quiet = 0;                      // set by -q: no progress messages
char *Output_file = OUTPUT_FILE;    // set by -o: where duplicate hashes go

// declared here but not referenced anywhere else in this file
FILE* Input;
long int Input_size;
30
+
31
+ void print_usage(FILE *stream) {
32
+ fprintf(stream, "\
33
+ Usage: hashdup [OPTIONS] FILE [FILE...]\n\
34
+ Identify duplicate hashes.\n\
35
+ \n\
36
+ -o FILE output file (default: %s)\n\
37
+ -q quiet; suppress all output except for errors\n\
38
+ \n\
39
+ -V print version information and exit\n\
40
+ -h display this help and exit\n\
41
+ \n\
42
+ Project home page: <http://code.google.com/p/onion/>\n",
43
+ OUTPUT_FILE);
44
+ }
45
+
46
+ // taken from http://cs.wikipedia.org/wiki/Quicksort
47
+ void quicksort(hash_t array[], long int left_begin, long int right_begin) {
48
+ hash_t pm = array[(left_begin + right_begin) / 2];
49
+ long int left_index, right_index;
50
+ left_index = left_begin;
51
+ right_index = right_begin;
52
+ do {
53
+ while (array[left_index] < pm)
54
+ left_index++;
55
+ while (array[right_index] > pm)
56
+ right_index--;
57
+ if (left_index <= right_index) {
58
+ hash_t value = array[left_index];
59
+ array[left_index] = array[right_index];
60
+ array[right_index] = value;
61
+ left_index++;
62
+ right_index--;
63
+ }
64
+ } while (left_index < right_index);
65
+ if (right_index > left_begin)
66
+ quicksort(array, left_begin, right_index);
67
+ if (left_index < right_begin)
68
+ quicksort(array, left_index, right_begin);
69
+ }
70
+
71
/*
 * Report on stderr how many of the input files have been handled so far.
 *
 * The line starts with the current local time as formatted by ctime();
 * the %.24s precision cuts that string off before its trailing newline.
 */
void print_progress(int processed_files, int total_files) {
    time_t now = time(NULL);
    const char* timestamp = ctime(&now);

    fprintf(stderr, "[%.24s] hashdup: %i / %i files processed\n", timestamp,
            processed_files, total_files);
}
77
+
78
+ int main(int argc, char **argv) {
79
+ // get options
80
+ int c;
81
+ while ((c = getopt(argc, argv, "o:qVh")) != -1) {
82
+ errno = 0;
83
+ switch (c) {
84
+ case 'o':
85
+ Output_file = optarg;
86
+ break;
87
+ case 'q':
88
+ Quiet = 1;
89
+ break;
90
+ case 'V':
91
+ print_version("hashdup");
92
+ return 0;
93
+ case 'h':
94
+ print_usage(stdout);
95
+ return 0;
96
+ case '?':
97
+ print_usage(stderr);
98
+ return 1;
99
+ }
100
+ }
101
+
102
+ if (optind >= argc) {
103
+ fprintf(stderr, "No input.\n");
104
+ print_usage(stderr);
105
+ return 1;
106
+ }
107
+
108
+ // output file
109
+ errno = 0;
110
+ FILE* output_fp = fopen(Output_file, "w");
111
+ if (errno != 0) {
112
+ fprintf(stderr, "Unable to open %s for writing.\n", Output_file);
113
+ return 1;
114
+ }
115
+
116
+ int input_files_count = argc - optind;
117
+
118
+ // for all input files
119
+ int i;
120
+ for (i=optind; i<argc; i++) {
121
+ // open file
122
+ char* filename = argv[i];
123
+ int input_fd = open(filename, O_RDONLY);
124
+ if (input_fd == -1) {
125
+ fprintf(stderr, "Unable to open %s for reading.\n", filename);
126
+ return 1;
127
+ }
128
+
129
+ // determine file size
130
+ unsigned long int file_size = lseek(input_fd, 0L, SEEK_END);
131
+ lseek(input_fd, 0L, SEEK_SET);
132
+
133
+ // map hashes into memory
134
+ hash_t* hashes = NULL;
135
+ hashes = (hash_t*) mmap(hashes, file_size, PROT_READ | PROT_WRITE,
136
+ MAP_PRIVATE, input_fd, 0);
137
+
138
+ // sort hashes
139
+ unsigned long int hash_count = file_size / sizeof(hash_t);
140
+ quicksort(hashes, 0, hash_count-1);
141
+
142
+ // send duplicate hashes to the output
143
+ int written = 0;
144
+ hash_t prev_hash = hashes[0];
145
+ hash_t hash;
146
+ unsigned long int j;
147
+ for (j=1; j<hash_count; j++) {
148
+ hash = hashes[j];
149
+ if (hash == prev_hash) {
150
+ if (!written) {
151
+ fwrite(&hash, sizeof(hash), 1, output_fp);
152
+ written = 1;
153
+ }
154
+ }
155
+ else {
156
+ written = 0;
157
+ }
158
+ prev_hash = hash;
159
+ }
160
+
161
+ munmap(hashes, file_size);
162
+ close(input_fd);
163
+
164
+ // print progress information
165
+ if (!Quiet)
166
+ print_progress(i - optind + 1, input_files_count);
167
+ }
168
+
169
+ fclose(output_fp);
170
+
171
+ return 0;
172
+ }