dalla-data-processing 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. dalla/__init__.py +27 -0
  2. dalla/cli.py +453 -0
  3. dalla/core/__init__.py +6 -0
  4. dalla/core/dataset.py +387 -0
  5. dalla/core/parallel.py +279 -0
  6. dalla/deduplication/__init__.py +370 -0
  7. dalla/deduplication/bin/.gitignore +1 -0
  8. dalla/deduplication/bin/onion-linux-x86_64 +0 -0
  9. dalla/deduplication/onion/COPYING +24 -0
  10. dalla/deduplication/onion/Makefile +21 -0
  11. dalla/deduplication/onion/Makefile.config +3 -0
  12. dalla/deduplication/onion/README.md +21 -0
  13. dalla/deduplication/onion/src/Makefile +22 -0
  14. dalla/deduplication/onion/src/Makefile.g +23 -0
  15. dalla/deduplication/onion/src/buzhash.c +325 -0
  16. dalla/deduplication/onion/src/buzhash.h +30 -0
  17. dalla/deduplication/onion/src/hashdup.c +172 -0
  18. dalla/deduplication/onion/src/hashgen.c +206 -0
  19. dalla/deduplication/onion/src/onion +0 -0
  20. dalla/deduplication/onion/src/onion.c +799 -0
  21. dalla/deduplication/onion/src/onion_dup.c +824 -0
  22. dalla/deduplication/onion/src/version.c +17 -0
  23. dalla/deduplication/onion/src/version.h +10 -0
  24. dalla/deduplication/onion/src_sc/Makefile +22 -0
  25. dalla/deduplication/onion/src_sc/Makefile.g +23 -0
  26. dalla/deduplication/onion/src_sc/buzhash.c +325 -0
  27. dalla/deduplication/onion/src_sc/buzhash.h +30 -0
  28. dalla/deduplication/onion/src_sc/hashdup +0 -0
  29. dalla/deduplication/onion/src_sc/hashdup.c +172 -0
  30. dalla/deduplication/onion/src_sc/hashgen +0 -0
  31. dalla/deduplication/onion/src_sc/hashgen.c +206 -0
  32. dalla/deduplication/onion/src_sc/onion.c +854 -0
  33. dalla/deduplication/onion/src_sc/onion_dup.c +824 -0
  34. dalla/deduplication/onion/src_sc/version.c +17 -0
  35. dalla/deduplication/onion/src_sc/version.h +10 -0
  36. dalla/deduplication/onion_wrapper.py +223 -0
  37. dalla/deduplication/postprocessing.py +216 -0
  38. dalla/deduplication/preprocessing.py +120 -0
  39. dalla/quality/__init__.py +5 -0
  40. dalla/quality/checker.py +354 -0
  41. dalla/readability/__init__.py +197 -0
  42. dalla/readability/ranking.py +165 -0
  43. dalla/readability/scorer.py +148 -0
  44. dalla/stemming/__init__.py +551 -0
  45. dalla/stemming/data/words_al.txt +3414 -0
  46. dalla/stemming/data/words_al_t.txt +885 -0
  47. dalla/stemming/data/words_t.txt +7 -0
  48. dalla/utils/__init__.py +10 -0
  49. dalla/utils/logger.py +128 -0
  50. dalla/utils/tokenize.py +89 -0
  51. dalla_data_processing-0.0.1.dist-info/METADATA +393 -0
  52. dalla_data_processing-0.0.1.dist-info/RECORD +55 -0
  53. dalla_data_processing-0.0.1.dist-info/WHEEL +5 -0
  54. dalla_data_processing-0.0.1.dist-info/entry_points.txt +2 -0
  55. dalla_data_processing-0.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,854 @@
1
+ /*********************************************************************
2
+ * Copyright (c) 2011-2020 Jan Pomikalek, Milos Jakubicek *
3
+ * All rights reserved. *
4
+ * *
5
+ * This software is licensed as described in the file COPYING, which *
6
+ * you should have received as part of this distribution. *
7
+ *********************************************************************
8
+ * *
9
+ * Refactored in 2023 by Sally Choker and Fadi Zaraket. *
10
+ * Modifications: *
11
+ * - Adapted to take as input a list of file paths instead of *
12
+ * a single file. *
13
+ * - Enhanced to detect the source of duplicate files *
14
+ * - Calculate the score of duplication from each duplicate file *
15
+ *********************************************************************/
16
+
17
+ #include "buzhash.h"
18
+ #include "version.h"
19
+ #include <algorithm>
20
+ #include <errno.h>
21
+ #include <fstream>
22
+ #include <iostream>
23
+ #include <sstream>
24
+ #include <stdio.h>
25
+ #include <string.h>
26
+ #include <sys/resource.h>
27
+ #include <sys/stat.h>
28
+ #include <sys/time.h>
29
+ #include <time.h>
30
+ #include <unistd.h>
31
+ #include <unordered_map>
32
+ #include <vector>
33
+
34
+ #define GOOGLE_SPARSE
35
+
36
+ #if defined GOOGLE_SPARSE
37
+ #include <sparsehash/sparse_hash_map>
38
+ // #include <sparsehash/sparse_hash_set>
39
+ // using google::sparse_hash_set;
40
+ #elif defined __GNUC__ || defined __APPLE__
41
+ #include <ext/hash_map>
42
+ namespace std { using namespace __gnu_cxx; }
43
+ #else
44
+ #include <hash_map>
45
+ #endif
46
+ using namespace std;
47
+
48
+ #ifdef GOOGLE_SPARSE
49
+ // typedef sparse_hash_set<uint64_t> ngrhash;
50
+ typedef google::sparse_hash_map<uint64_t, int> ngrhash;
51
+ #else
52
+ // typedef hash_map<uint64_t,bool> ngrhash;
53
+ typedef hash_map<uint64_t, int> ngrhash;
54
+ #endif
55
+
56
+ // initialize vector to store pair filenames and duplicates of all files
57
+ std::vector<std::pair<string, int>> foundFilenames;
58
+
59
+ // initialize vector to store only the (filename, source file index) pairs of files found to contain duplicates
60
+ std::vector<std::pair<string, int>> DuplicateFilenames;
61
+ // initialize a vector to store documents not checked for deduplication (don't
62
+ // fit in the buffer)
63
+ std::vector<int> UncheckedFilenames;
64
+
65
+ // initialize a vector to store duplicate filenames and their score:
66
+ // bad_tokens/total_tokens
67
+ std::vector<std::pair<string, float>> DuplicateFilenamesScores;
68
+
69
+ #define BITMASK_HIGH63 0xfffffffffffffffeul
70
+
71
+ #define NGRAM_SIZE 10
72
+ #define DUPL_THRES 0.9
73
+ #define DOC_TAG "doc"
74
+ #define PAR_TAG "p"
75
+ #define TRIM_HASHES 64
76
+ #define MAX_STUB_LENGTH 20
77
+ #define BUFFER_SIZE 335544320
78
+
79
+ // options (file-level state; defaults come from the macros above and are overridden by the command-line switches parsed in main)
80
+ int Ngram_size = NGRAM_SIZE; // -n: n-gram length handed to buzhash_init_buffer
81
+ float Dupl_thres = DUPL_THRES; // -t: a paragraph is bad when bad_tokens/total_tokens exceeds this
82
+ const char *Doc_tag = DOC_TAG; // -d: document tag name, without the angle brackets
83
+ const char *Par_tag = PAR_TAG; // -p: paragraph tag name, without the angle brackets
84
+ int Strip_dupl = 0; // -s: set to 1 by the option parser
85
+ int No_smoothing = 0; // -m: when 1, the stub-smoothing pass is skipped
86
+ int Trim_hashes = TRIM_HASHES; // -T: number of hash bits to keep
87
+ int Max_stub_length = MAX_STUB_LENGTH; // -l: good stretches up to this many tokens between bad paragraphs are marked bad too
88
+ long Buffer_size = BUFFER_SIZE; // -b: size in bytes of the read buffer; the token/paragraph arrays are sized from it as well
89
+ const char *output_dir = NULL; // -O: directory that receives the CSV reports
90
+ int Quiet = 0; // -q: when 1, progress lines are suppressed
91
+ char *Dupl_hashes_path = NULL; // -f: path of a file with hashes of duplicate n-grams
92
+ FILE *Input; // stream read by process_one_file; main reopens it for every listed file
93
+ long int Input_size; // size of the positional FILE in bytes, or -1; only used for the progress percentage
94
+ char current_file_name[2048]; // path of the file being processed, as read from the list file
95
+ int fileNameIndex = 0; // 1-based position of the current file in the list file; stored as the value in the hash maps
96
+
97
+ void print_usage(FILE *stream) {
98
+ fprintf(stream, "\
99
+ Usage: onion [OPTIONS] [FILE]\n\
100
+ Mark duplicate text parts in the input vertical file.\n\
101
+ \n\
102
+ -f FILE hashes of duplicate n-grams\n\
103
+ -n NUM n-gram length (default: %i)\n\
104
+ -t NUM duplicate content threshold (default: %.1f)\n\
105
+ -d STR document tag (default: %s)\n\
106
+ -p STR paragraph tag (default: %s)\n\
107
+ -s strip duplicate parts (rather than mark)\n\
108
+ -m no smoothing\n\
109
+ -T NUM trim n-gram hashes to NUM bits (default: %i)\n\
110
+ -l NUM max stub length (default: %i)\n\
111
+ -b NUM buffer size, in bytes (default: %i)\n\
112
+ -q quiet; suppress all output except for errors\n\
113
+ \n\
114
+ -V print version information and exit\n\
115
+ -h display this help and exit\n\
116
+ \n\
117
+ With no FILE, or when FILE is -, read standard input.\n\
118
+ Output is written to standard output.\n\
119
+ \n\
120
+ Project home page: <http://code.google.com/p/onion/>\n",
121
+ NGRAM_SIZE, DUPL_THRES, DOC_TAG, PAR_TAG, TRIM_HASHES,
122
+ MAX_STUB_LENGTH, BUFFER_SIZE);
123
+ }
124
+
125
/**
 * Write one timestamped progress line to stderr.
 *
 * @param task_descr      short label for the running phase
 * @param processed_bytes number of bytes handled so far (reported in MB)
 * @param percent_done    completion percentage; pass a negative value to
 *                        omit the percentage from the line
 */
void print_progress(const char *task_descr, unsigned long int processed_bytes,
                    float percent_done) {
  time_t now;
  struct rusage usage;
  time(&now);
  getrusage(RUSAGE_SELF, &usage);
  // %lu: processed_bytes is unsigned long, so the former %li was a
  // format/argument mismatch.
  fprintf(stderr, "[%.24s] onion: %s: %6lu MB processed", ctime(&now),
          task_descr, processed_bytes / (1024 * 1024));
  if (percent_done >= 0)
    fprintf(stderr, " (%6.2f%%)", percent_done);
  // ru_maxrss is divided by 1024 before being labelled MB.
  fprintf(stderr, "\t%6li MB RAM used", usage.ru_maxrss / 1024);
  fprintf(stderr, "\n");
}
138
+
139
+ void saveGlobalHashmap(const ngrhash &global, const std::string &filename) {
140
+ std::ofstream outFile(filename.c_str(), std::ios::binary | std::ios::app);
141
+ if (!outFile.is_open()) {
142
+ throw std::runtime_error("Failed to open file for writing");
143
+ }
144
+
145
+ for (const auto &pair : global) {
146
+ outFile.write(reinterpret_cast<const char *>(&pair.first),
147
+ sizeof(pair.first));
148
+ outFile.write(reinterpret_cast<const char *>(&pair.second),
149
+ sizeof(pair.second));
150
+ }
151
+
152
+ outFile.close();
153
+ }
154
+
155
/**
 * Report whether stat() succeeds for `name`.
 *
 * @param name path to probe
 * @return true when stat() returns 0, false otherwise
 */
bool fileExists(const std::string &name) {
  struct stat info;
  const int rc = stat(name.c_str(), &info);
  if (rc != 0)
    return false;
  return true;
}
159
+
160
/**
 * Append (filename, source file index) rows to a CSV file.
 *
 * Each row is `"<filename>",<index>`. The filename is always quoted, and a
 * double quote inside it is doubled so the field stays one well-formed CSV
 * field (previously an embedded quote broke the row). Paths without quotes
 * produce exactly the same bytes as before.
 *
 * The file is opened in append mode, which creates it when it is missing, so
 * the former "does it exist?" probe is not needed.
 *
 * @param vec      rows to write
 * @param fullPath destination file; on open or write failure a message is
 *                 printed to stderr and the function returns
 */
void writeFilenameDuplicateToCSV(
    const std::vector<std::pair<std::string, int>> &vec,
    const std::string &fullPath) {
  std::ofstream file(fullPath, std::ios::app);
  if (!file.is_open()) {
    std::cerr << "Failed to open the file: " << fullPath << std::endl;
    return;
  }

  for (const auto &entry : vec) {
    file << '"';
    for (const char ch : entry.first) {
      if (ch == '"')
        file << '"'; // CSV escape: a quote inside a quoted field is doubled
      file << ch;
    }
    file << "\"," << entry.second << '\n';
  }

  file.close();
  if (!file) {
    // close() flushes; a failure here means rows were lost (e.g. disk full).
    std::cerr << "Failed to write the file: " << fullPath << std::endl;
  }
}
179
+
180
/**
 * Append (filename, duplication score) rows to a CSV file.
 *
 * Rows are written as `<filename>,<score>`. A filename is left bare, exactly
 * as before, unless it contains a comma, a double quote or a line break; in
 * that case it is wrapped in double quotes with embedded quotes doubled, so
 * the row still parses as two fields.
 *
 * The file is opened in append mode, which creates it when it is missing.
 *
 * @param vec      rows to write
 * @param fullPath destination file; on open or write failure a message is
 *                 printed to stderr and the function returns
 */
void writeFilenameDuplicateScoreToCSV(
    const std::vector<std::pair<std::string, float>> &vec,
    const std::string &fullPath) {
  std::ofstream file(fullPath, std::ios::app);
  if (!file.is_open()) {
    std::cerr << "Failed to open the file: " << fullPath << std::endl;
    return;
  }

  for (const auto &entry : vec) {
    const std::string &name = entry.first;
    const bool needs_quotes = name.find_first_of(",\"\r\n") != std::string::npos;
    if (!needs_quotes) {
      file << name;
    } else {
      file << '"';
      for (const char ch : name) {
        if (ch == '"')
          file << '"'; // double an embedded quote
        file << ch;
      }
      file << '"';
    }
    file << "," << entry.second << "\n";
  }

  file.close();
  if (!file) {
    // close() flushes; a failure here means rows were lost (e.g. disk full).
    std::cerr << "Failed to write the file: " << fullPath << std::endl;
  }
}
199
+
200
/**
 * Write the indices of files that were not checked for duplication, one
 * number per line.
 *
 * When `fullPath` can already be opened for reading the numbers are appended
 * to it; otherwise the file is opened for plain output.
 *
 * @param vec      file indices to write
 * @param fullPath destination file; if it cannot be opened a message is
 *                 printed to stderr and nothing is written
 */
void writeUncheckedFilenamesToCSV(const std::vector<int> &vec,
                                  const std::string &fullPath) {
  const bool already_there = std::ifstream(fullPath).is_open();

  std::ios::openmode mode = std::ios::out;
  if (already_there)
    mode = std::ios::app;

  std::ofstream sink;
  sink.open(fullPath, mode);
  if (!sink.is_open()) {
    std::cerr << "Failed to open the file: " << fullPath << std::endl;
    return;
  }

  for (std::vector<int>::size_type idx = 0; idx < vec.size(); ++idx)
    sink << vec[idx] << "\n";

  sink.close();
}
218
+
219
+ int process_one_par(int *&pars, char **&tokens, buzhash_buffer_t &bh_buffer,
220
+ hash_t &hash_bitmask, ngrhash &local, ngrhash &global,
221
+ int &bad_tokens, int &tok_i, int &par_i, int &total_tokens,
222
+ int &prev_bad_tokens, int &have_dupl_ngrams) {
223
+ ngrhash::const_iterator it;
224
+ for (tok_i = pars[par_i]; tok_i < pars[par_i + 1]; tok_i++) {
225
+ char *token = tokens[tok_i];
226
+ if (token[0] == '<')
227
+ continue;
228
+ total_tokens++;
229
+ // prev_bad_tokens--;
230
+ // if (prev_bad_tokens < 0)
231
+ // prev_bad_tokens = 0;
232
+ hash_t hash = buzhash(token, &bh_buffer);
233
+ hash_t masked_hash = hash & hash_bitmask;
234
+ if (!buzhash_is_full_buffer(&bh_buffer))
235
+ continue;
236
+ it = local.find(hash);
237
+ if (it == local.end()) {
238
+ if (have_dupl_ngrams) {
239
+ // test with the last bit set to 1
240
+ // // (check against already seen duplicate ngrams)
241
+ it = global.find(masked_hash | 1);
242
+ } else {
243
+ it = global.find(masked_hash);
244
+ }
245
+ }
246
+ if (it != global.end()) {
247
+ bad_tokens++; // bad_tokens += Ngram_size - prev_bad_tokens;
248
+ // prev_bad_tokens = Ngram_size;
249
+ // if (fileNameIndex != it->second) {
250
+ foundFilenames.push_back(
251
+ make_pair(current_file_name, it->second)); // Save the filename.
252
+ //}
253
+ }
254
+ #ifdef GOOGLE_SPARSE
255
+ local.insert(std::make_pair(hash, fileNameIndex)); // local.insert(hash);
256
+ #else
257
+ local[hash] = fileNameIndex; // local[hash] = true;
258
+ #endif
259
+ }
260
+ }
261
+
262
+ int process_one_file(int &buffer_content, char **&tokens, int *&pars,
263
+ int *&par_len, char *&bad_par,
264
+
265
+ int *docs,
266
+
267
+ char *doc_tag, int doc_tag_len, char *doc_end_tag,
268
+ int doc_end_tag_len, char *par_tag, int par_tag_len,
269
+ char *par_end_tag, int par_end_tag_len,
270
+
271
+ int &have_dupl_ngrams,
272
+ unsigned long int &total_processed_bytes,
273
+
274
+ // make sure these become mutable
275
+ ngrhash &global, ngrhash &local,
276
+ buzhash_buffer_t &bh_buffer, char *buffer) {
277
+ int bytes_read = fread(buffer + buffer_content, sizeof(char),
278
+ Buffer_size - buffer_content, Input);
279
+ // print("Buffer %x, BufferContent %d, BufferSize %ld, \n", buffer,
280
+ // buffer_content, Buffer_size);
281
+ hash_t hash_bitmask = 0xfffffffffffffffful;
282
+
283
+ int buffer_size = buffer_content + bytes_read;
284
+ buffer[buffer_size] = '\0'; // make it a string
285
+ char *buffer_pos = buffer;
286
+
287
+ // find tokens
288
+ int token_count = 0;
289
+ tokens[token_count++] = buffer_pos++;
290
+ while ((buffer_pos = strchr(buffer_pos, '\n')) != NULL) {
291
+ buffer_pos[0] = '\0';
292
+ tokens[token_count++] = ++buffer_pos;
293
+ }
294
+
295
+ // find docs and paragraphs
296
+ int doc_count = 0;
297
+ int par_count = 0;
298
+ docs[doc_count++] = 0;
299
+ pars[par_count++] = 0;
300
+ int start_doc_next = 0;
301
+ int start_par_next = 0;
302
+ int i;
303
+
304
+ for (i = 1; i < token_count; i++) {
305
+ // "<doc>" or "<doc "
306
+ if (start_doc_next ||
307
+ (strncmp(tokens[i], doc_tag, doc_tag_len) == 0 &&
308
+ (tokens[i][doc_tag_len] == ' ' || tokens[i][doc_tag_len] == '>'))) {
309
+ docs[doc_count++] = par_count;
310
+ pars[par_count++] = i;
311
+ start_doc_next = 0;
312
+ }
313
+ // "</doc>"
314
+ else if (strncmp(tokens[i], doc_end_tag, doc_end_tag_len) == 0) {
315
+ start_doc_next = 1;
316
+ }
317
+ // "<p>" or "<p "
318
+ else if (start_par_next || (strncmp(tokens[i], par_tag, par_tag_len) == 0 &&
319
+ (tokens[i][par_tag_len] == ' ' ||
320
+ tokens[i][par_tag_len] == '>'))) {
321
+ pars[par_count++] = i;
322
+ start_par_next = 0;
323
+ }
324
+ // "</p>"
325
+ else if (strncmp(tokens[i], par_end_tag, par_end_tag_len) == 0) {
326
+ start_par_next = 1;
327
+ }
328
+ }
329
+
330
+ if (doc_count == 1 && !feof(Input)) {
331
+ // full buffer contains only one document
332
+ // by default, documents with higher than 16MB are not checked for dedup
333
+ fprintf(stderr, "Too long document at byte %li.\n", total_processed_bytes);
334
+ UncheckedFilenames.push_back(fileNameIndex);
335
+ return 1;
336
+ }
337
+
338
+ if (feof(Input)) {
339
+ // create sentinels
340
+ docs[doc_count++] = par_count;
341
+ if (strlen(tokens[token_count - 1]) == 0) {
342
+ // files ending with a newline have a natural sentinel
343
+ // (the last zero-length token)
344
+ pars[par_count++] = token_count - 1;
345
+ } else {
346
+ // for other files, we need to make up the last token
347
+ pars[par_count++] = token_count;
348
+ tokens[token_count++] = buffer + buffer_size;
349
+ }
350
+ }
351
+
352
+ // for all documents
353
+ int doc_i;
354
+ for (doc_i = 0; doc_i < doc_count - 1; doc_i++) {
355
+ buzhash_clear_buffer(&bh_buffer);
356
+ local.clear();
357
+ // for all paragraphs in the document
358
+ int par_i;
359
+ for (par_i = docs[doc_i]; par_i < docs[doc_i + 1]; par_i++) {
360
+ int total_tokens = 0;
361
+ int bad_tokens = 0;
362
+ /* prev_bad_tokens is the number of tokens in the current
363
+ * n-gram which are contained in one of the previous bad
364
+ * n-grams.
365
+ *
366
+ * At the beginning of a new paragraph we need to pretend that
367
+ * there are Ngram_size prev_bad_tokens so that the leading
368
+ * bad n-grams do not generate too many bod tokens. */
369
+ int prev_bad_tokens = Ngram_size;
370
+ // for all tokens in the paragraph
371
+ // for every token in the current paragraph, it computes its hash and
372
+ // checks if it is present in the local and global hash maps
373
+ int tok_i;
374
+ float score;
375
+ for (tok_i = pars[par_i]; tok_i < pars[par_i + 1]; tok_i++) {
376
+ process_one_par(pars, tokens, bh_buffer, hash_bitmask, local, global,
377
+ bad_tokens, tok_i, par_i, total_tokens, prev_bad_tokens,
378
+ have_dupl_ngrams);
379
+ }
380
+
381
+ // remember the length of the paragraph
382
+ par_len[par_i] = total_tokens;
383
+
384
+ // mark bad paragraphs
385
+ bad_par[par_i] =
386
+ (total_tokens > 0 && (1.0 * bad_tokens / total_tokens) > Dupl_thres);
387
+
388
+ // get score for each file
389
+ score = (1.0 * bad_tokens / total_tokens);
390
+ if (score > Dupl_thres) {
391
+ DuplicateFilenamesScores.push_back(make_pair(current_file_name, score));
392
+ }
393
+ // DuplicateFilenamesScores.push_back(make_pair(fileNameIndex, score));
394
+ }
395
+
396
+ // smoothing
397
+ if (!No_smoothing) {
398
+ int last_bad_par = docs[doc_i] - 1;
399
+ int stub_length = 0;
400
+ for (par_i = docs[doc_i]; par_i <= docs[doc_i + 1]; par_i++) {
401
+ if (par_i == docs[doc_i + 1] || bad_par[par_i]) {
402
+ if (stub_length <= Max_stub_length) {
403
+ // remove stub
404
+ int par_j;
405
+ for (par_j = last_bad_par + 1; par_j < par_i; par_j++)
406
+ bad_par[par_j] = 1;
407
+ }
408
+ last_bad_par = par_i;
409
+ stub_length = 0;
410
+ } else {
411
+ stub_length += par_len[par_i];
412
+ }
413
+ }
414
+ }
415
+
416
+ int count_bad = 0;
417
+ for (par_i = docs[doc_i]; par_i < docs[doc_i + 1]; par_i++) {
418
+ if (!bad_par[par_i] && par_len[par_i] > 0) {
419
+ } else {
420
+ count_bad += 1;
421
+ }
422
+ }
423
+ if (count_bad > 0) {
424
+ std::sort(foundFilenames.begin(), foundFilenames.end());
425
+ // auto uniqueEnd = std::unique(foundFilenames.begin(),
426
+ // foundFilenames.end()); foundFilenames.erase(uniqueEnd,
427
+ // foundFilenames.end());
428
+ printf("%s is %d bad\n", current_file_name, count_bad);
429
+ DuplicateFilenames.insert(DuplicateFilenames.end(),
430
+ foundFilenames.begin(), foundFilenames.end());
431
+ // DuplicateFilenames.push_back(current_file_name);
432
+ }
433
+
434
+ foundFilenames.clear();
435
+
436
+ // is there at least one good paragraph?
437
+ int all_bad = 1;
438
+ for (par_i = docs[doc_i]; par_i < docs[doc_i + 1]; par_i++) {
439
+ if (!bad_par[par_i] && par_len[par_i] > 0) {
440
+ all_bad = 0;
441
+ break;
442
+ }
443
+ }
444
+
445
+ buzhash_clear_buffer(&bh_buffer);
446
+ // for all paragraphs in the document (again)
447
+ for (par_i = docs[doc_i]; par_i < docs[doc_i + 1]; par_i++) {
448
+ int first_token = pars[docs[doc_i]];
449
+ int last_token = pars[docs[doc_i + 1]] - 1;
450
+ // for all tokens in the paragraph
451
+ int tok_i;
452
+ for (tok_i = pars[par_i]; tok_i < pars[par_i + 1]; tok_i++) {
453
+ char *token = tokens[tok_i];
454
+ int bad_token = bad_par[par_i];
455
+ // tags at document boundaries are preserved if there is
456
+ // at least one good paragraph, removed otherwise
457
+ if (tok_i == first_token && strncmp(token, doc_tag, doc_tag_len) == 0)
458
+ bad_token = all_bad;
459
+ if (tok_i == last_token && strcmp(token, doc_end_tag) == 0)
460
+ bad_token = all_bad;
461
+ // print output
462
+ // if (Strip_dupl) {
463
+ // if (!bad_token)
464
+ // printf("%s\n", token);
465
+ //}
466
+ // else {
467
+ // printf("%i\t%s\n", bad_token, token);
468
+ //}
469
+ if (token[0] == '<')
470
+ continue;
471
+ // store hashes of n-grams
472
+ hash_t hash = buzhash(token, &bh_buffer);
473
+ hash_t masked_hash = hash & hash_bitmask;
474
+ if (!buzhash_is_full_buffer(&bh_buffer))
475
+ continue;
476
+ if (!bad_par[par_i]) {
477
+ if (have_dupl_ngrams) {
478
+ // If we have the list of hashes of all duplicate
479
+ // n-grams, we set the least significant bit of the
480
+ // stored hash to 1 if we have seen the matching
481
+ // duplicate n-gram to indicate it has been seen.
482
+ // Unique n-grams are ignored.
483
+ if (global.erase(masked_hash & BITMASK_HIGH63))
484
+ #ifdef GOOGLE_SPARSE
485
+ global.insert(std::make_pair(masked_hash | 1, fileNameIndex));
486
+ // global.insert (masked_hash | 1);
487
+ #else
488
+ global[masked_hash | 1] =
489
+ fileNameIndex; // global[masked_hash | 1] = true;
490
+ #endif
491
+ } else {
492
+ // otherwise we have to store hashes of all n-grams
493
+ #ifdef GOOGLE_SPARSE
494
+ global.insert(std::make_pair(
495
+ masked_hash, fileNameIndex)); // global.insert(masked_hash);
496
+ #else
497
+ global[masked_hash] = fileNameIndex; // global[masked_hash] = true;
498
+ #endif
499
+ }
500
+ }
501
+ }
502
+ }
503
+ }
504
+
505
+ // copy the unprocessed data to the beginning of the buffer
506
+ if (!feof(Input)) {
507
+ char *last_doc_fst_tok = tokens[pars[docs[doc_count - 1]]];
508
+ int processed_bytes = last_doc_fst_tok - buffer;
509
+ total_processed_bytes += processed_bytes;
510
+ int remaining_bytes = buffer_size - processed_bytes;
511
+ char *buffer_end = last_doc_fst_tok + remaining_bytes;
512
+ char *pos;
513
+ // replace \0s with EOLs (revert the buffer contents to original)
514
+ for (pos = last_doc_fst_tok; pos < buffer_end; pos++)
515
+ if (pos[0] == '\0')
516
+ pos[0] = '\n';
517
+ // print progress information
518
+ if (!Quiet) {
519
+ float percent_done = -1;
520
+ if (Input_size > 0)
521
+ percent_done = 100.0 * total_processed_bytes / Input_size;
522
+ print_progress("removing duplicates", total_processed_bytes,
523
+ percent_done);
524
+ }
525
+ memmove(buffer, last_doc_fst_tok, remaining_bytes);
526
+ buffer_content = remaining_bytes;
527
+ }
528
+ printf("end of process_one_file\n");
529
+ }
530
+
531
+ int main(int argc, char **argv) {
532
+ printf("Dupl Threshold %f \n", Dupl_thres);
533
+ printf("N-gram size %d \n", Ngram_size);
534
+ // get options
535
+ int c;
536
+ char *endptr;
537
+ char *datasetname = NULL;
538
+ char *listOfFilesPath = NULL;
539
+
540
+ while ((c = getopt(argc, argv, "f:n:t:d:p:smT:l:b:qVhD:L:O:")) != -1) {
541
+ errno = 0;
542
+ switch (c) {
543
+ case 'f':
544
+ Dupl_hashes_path = optarg;
545
+ break;
546
+ case 'n':
547
+ Ngram_size = strtol(optarg, &endptr, 10);
548
+ if (errno != 0 || *endptr != '\0') {
549
+ fprintf(stderr, "Integer value expected for -n, got: %s\n", optarg);
550
+ print_usage(stderr);
551
+ return 1;
552
+ }
553
+ break;
554
+ case 't':
555
+ Dupl_thres = strtod(optarg, &endptr);
556
+ if (errno != 0 || *endptr != '\0') {
557
+ fprintf(stderr, "Float value expected for -t, got: %s\n", optarg);
558
+ print_usage(stderr);
559
+ return 1;
560
+ }
561
+ break;
562
+ case 'd':
563
+ Doc_tag = optarg;
564
+ break;
565
+ case 'p':
566
+ Par_tag = optarg;
567
+ break;
568
+ case 's':
569
+ Strip_dupl = 1;
570
+ break;
571
+ case 'm':
572
+ No_smoothing = 1;
573
+ break;
574
+ case 'T':
575
+ Trim_hashes = strtol(optarg, &endptr, 10);
576
+ if (errno != 0 || *endptr != '\0') {
577
+ fprintf(stderr, "Integer value expected for -T, got: %s\n", optarg);
578
+ print_usage(stderr);
579
+ return 1;
580
+ }
581
+ break;
582
+ case 'l':
583
+ Max_stub_length = strtol(optarg, &endptr, 10);
584
+ if (errno != 0 || *endptr != '\0') {
585
+ fprintf(stderr, "Integer value expected for -l, got: %s\n", optarg);
586
+ print_usage(stderr);
587
+ return 1;
588
+ }
589
+ break;
590
+ case 'b':
591
+ Buffer_size = strtol(optarg, &endptr, 10);
592
+ if (errno != 0 || *endptr != '\0') {
593
+ fprintf(stderr, "Integer value expected for -b, got: %s\n", optarg);
594
+ print_usage(stderr);
595
+ return 1;
596
+ }
597
+ break;
598
+ case 'q':
599
+ Quiet = 1;
600
+ break;
601
+ case 'V':
602
+ print_version("onion");
603
+ return 0;
604
+ case 'h':
605
+ print_usage(stdout);
606
+ return 0;
607
+ case '?':
608
+ print_usage(stderr);
609
+ return 1;
610
+ case 'D': // Dataset name
611
+ datasetname = optarg;
612
+ break;
613
+ case 'L': // List of files path
614
+ listOfFilesPath = optarg;
615
+ break;
616
+ case 'O': // Output directory
617
+ output_dir = optarg;
618
+ break;
619
+ }
620
+ }
621
+
622
+ Input = stdin;
623
+ Input_size = -1;
624
+ if (optind < argc) {
625
+ char *filename = argv[optind];
626
+ if (strcmp(filename, "-") != 0) {
627
+ errno = 0;
628
+ Input = fopen(filename, "r");
629
+ if (errno != 0) {
630
+ fprintf(stderr, "Unable to open %s for reading.\n", filename);
631
+ return 1;
632
+ }
633
+ fseek(Input, 0L, SEEK_END);
634
+ Input_size = ftell(Input);
635
+ fseek(Input, 0L, SEEK_SET);
636
+ }
637
+ }
638
+
639
+ if (output_dir != NULL) {
640
+ struct stat st = {0};
641
+ if (stat(output_dir, &st) == -1) {
642
+ if (mkdir(output_dir, 0700) != 0) {
643
+ perror("Unable to create output directory");
644
+ return 1;
645
+ }
646
+ }
647
+ } else {
648
+ fprintf(stderr, "You must specify an output directory.\n");
649
+ print_usage(stderr);
650
+ return 1;
651
+ }
652
+
653
+ if (datasetname == NULL || listOfFilesPath == NULL) {
654
+ fprintf(stderr, "You must specify dataset name and list of files path.\n");
655
+ print_usage(stderr);
656
+ return 1;
657
+ }
658
+
659
+ FILE *List_of_Files = fopen(listOfFilesPath, "r");
660
+ if (List_of_Files == NULL) {
661
+ perror("Unable to open list of files");
662
+ return 1;
663
+ }
664
+
665
+ unsigned long int total_processed_bytes = 0;
666
+
667
+ // patterns
668
+ char *doc_tag = (char *)malloc((strlen(Doc_tag) + 1 + 1) * sizeof(char));
669
+ strcat(strcpy(doc_tag, "<"), Doc_tag);
670
+ char *doc_end_tag = (char *)malloc((strlen(Doc_tag) + 3 + 1) * sizeof(char));
671
+ strcat(strcat(strcpy(doc_end_tag, "</"), Doc_tag), ">");
672
+ char *par_tag = (char *)malloc((strlen(Par_tag) + 1 + 1) * sizeof(char));
673
+ strcat(strcpy(par_tag, "<"), Par_tag);
674
+ char *par_end_tag = (char *)malloc((strlen(Par_tag) + 3 + 1) * sizeof(char));
675
+ strcat(strcat(strcpy(par_end_tag, "</"), Par_tag), ">");
676
+
677
+ int doc_tag_len = strlen(doc_tag);
678
+ int doc_end_tag_len = strlen(doc_end_tag);
679
+ int par_tag_len = strlen(par_tag);
680
+ int par_end_tag_len = strlen(par_end_tag);
681
+
682
+ // bitmask for trimming ngram hashes
683
+ hash_t hash_bitmask = 0xfffffffffffffffful;
684
+ int bitshift = 64 - Trim_hashes;
685
+ if (bitshift > 0)
686
+ hash_bitmask >>= bitshift;
687
+
688
+ // data structures
689
+ int buffer_size = 0;
690
+ int buffer_content = 0;
691
+ char *buffer = (char *)malloc((Buffer_size + 1) * sizeof(char));
692
+ char **tokens = (char **)malloc((Buffer_size + 1) * sizeof(char *));
693
+ int *pars = (int *)malloc((Buffer_size + 1) *
694
+ sizeof(int)); // array of starting tokens
695
+ int *par_len = (int *)malloc((Buffer_size + 1) * sizeof(int));
696
+ char *bad_par = (char *)malloc((Buffer_size + 1) * sizeof(char));
697
+ int *docs =
698
+ (int *)malloc((Buffer_size + 1) * sizeof(int)); // array of starting pars
699
+ int token_count, par_count, doc_count;
700
+
701
+ // buzhash
702
+ buzhash_buffer_t bh_buffer;
703
+ buzhash_init_buffer(&bh_buffer, Ngram_size);
704
+
705
+ // global hash table stores the hashes of all files read so far (in our case
706
+ // the full document since we did not mark paragraph separators) local hash
707
+ // table stores the hahses of n-grams found within the currently processed
708
+ // file (doc as a whole in our case)
709
+ ngrhash global, local;
710
+ #ifdef GOOGLE_SPARSE
711
+ global.set_deleted_key(0);
712
+ local.set_deleted_key(0);
713
+ #endif
714
+
715
+ // read hashes of duplicate n-grams if available
716
+ int have_dupl_ngrams = 0;
717
+ if (Dupl_hashes_path != NULL) {
718
+ have_dupl_ngrams = 1;
719
+ errno = 0;
720
+ FILE *ngrams_fp = fopen(Dupl_hashes_path, "r");
721
+ if (errno != 0) {
722
+ fprintf(stderr, "Unable to open %s for reading.\n", Dupl_hashes_path);
723
+ return 1;
724
+ }
725
+ fseek(ngrams_fp, 0L, SEEK_END);
726
+ unsigned long int ngrams_size = ftell(ngrams_fp);
727
+ fseek(ngrams_fp, 0L, SEEK_SET);
728
+
729
+ unsigned long int bytes_read = 0;
730
+ hash_t hash;
731
+ while (fread(&hash, sizeof(hash), 1, ngrams_fp)) {
732
+ printf("reading");
733
+ bytes_read += sizeof(hash);
734
+ hash_t masked_hash = hash & hash_bitmask;
735
+ // store only the 63 most significant bits of the hash;
736
+ // reserve the last bit as a flag (seen / unseen)
737
+ // #ifdef GOOGLE_SPARSE
738
+ // global.insert(std::make_pair(masked_hash | 1,
739
+ // fileNameIndex));//global.insert (masked_hash & BITMASK_HIGH63); #else
740
+ // global[masked_hash & BITMASK_HIGH63] = true;
741
+ // #endif
742
+
743
+ // print progress information
744
+ if (!Quiet && bytes_read % (10000000 * sizeof(hash)) == 0) {
745
+ float percent_done = -1;
746
+ if (ngrams_size > 0)
747
+ percent_done = 100.0 * bytes_read / ngrams_size;
748
+ print_progress("reading hashes", bytes_read, percent_done);
749
+ }
750
+ }
751
+
752
+ if (!Quiet)
753
+ print_progress("reading hashes", bytes_read, 100);
754
+ }
755
+
756
+ int fileIndex =
757
+ 0; // File index t store every 300 duplicates in a new csv file
758
+
759
+ while (!feof(List_of_Files)) {
760
+ fileNameIndex++;
761
+ /* read the name of each file in the list of files*/
762
+ if (fgets(current_file_name, sizeof(current_file_name), List_of_Files) ==
763
+ NULL) {
764
+ break;
765
+ }
766
+ int last_ch = current_file_name[strlen(current_file_name) - 1];
767
+ if (last_ch == '\n') {
768
+ current_file_name[strlen(current_file_name) - 1] = '\0';
769
+ }
770
+ /* use fopen to assign Input to it*/
771
+ Input = fopen(current_file_name, "r");
772
+ if (Input == NULL) {
773
+ char err_msg[1024];
774
+ snprintf(err_msg, 1023, "Error to open data file %s--",
775
+ current_file_name);
776
+ perror(err_msg);
777
+ fprintf(stderr, "Skipping file: %s\n", current_file_name);
778
+ continue;
779
+ }
780
+
781
+ printf("File Number: %d \n", fileNameIndex);
782
+
783
+ /* Process it as below*/
784
+ // it modifies the data structures passed to it as arguments, processes the
785
+ // input file, and updates global state
786
+ process_one_file(buffer_content, /* int */
787
+ tokens, /* char** */
788
+ pars, /* int* */
789
+ par_len, /* int* */
790
+ bad_par, /* char* */
791
+
792
+ docs, /* int* */
793
+
794
+ doc_tag, /* char* */
795
+ doc_tag_len, /* int */
796
+ doc_end_tag, /* char* */
797
+ doc_end_tag_len, /* int */
798
+ par_tag, /* char* */
799
+ par_tag_len, /* int */
800
+ par_end_tag, /* char* */
801
+ par_end_tag_len, /* int */
802
+
803
+ have_dupl_ngrams, /* long int int */
804
+ total_processed_bytes, /* unsigned */
805
+ // make sure these become mutable
806
+ global, /* ngrhash */
807
+ local, /* ngrhash */
808
+ bh_buffer, /* buzhash_buffer_t */
809
+ buffer /* char* */
810
+ );
811
+
812
+ if (DuplicateFilenames.size() > 3000) {
813
+ fileIndex++;
814
+ std::string filename =
815
+ std::string(output_dir) + "/" + "Duplicate_pair_files_" +
816
+ std::string(datasetname) + "_" + to_string(fileIndex) + ".csv";
817
+
818
+ writeFilenameDuplicateToCSV(DuplicateFilenames, filename);
819
+
820
+ std::string filename_scores =
821
+ std::string(output_dir) + "/" + "Duplicate_scores_" +
822
+ std::string(datasetname) + "_" + to_string(fileIndex) + ".csv";
823
+
824
+ writeFilenameDuplicateScoreToCSV(DuplicateFilenamesScores,
825
+ filename_scores);
826
+ DuplicateFilenames.clear();
827
+ DuplicateFilenamesScores.clear();
828
+ printf(" 3000 Duplicated files are saved\n");
829
+ }
830
+
831
+ fclose(Input);
832
+ }
833
+
834
+ // write the remaining duplicate files
835
+ fileIndex++;
836
+ std::string filename = std::string(output_dir) + "/" +
837
+ "Duplicate_pair_files_" + std::string(datasetname) +
838
+ "_" + to_string(fileIndex) + ".csv";
839
+ writeFilenameDuplicateToCSV(DuplicateFilenames, filename);
840
+ std::string name = std::string(output_dir) + "/" + "Unchecked_files_" +
841
+ std::string(datasetname) + "_" + ".csv";
842
+ writeUncheckedFilenamesToCSV(UncheckedFilenames, name);
843
+
844
+ // print progress information
845
+ total_processed_bytes += buffer_size;
846
+ if (!Quiet)
847
+ print_progress("removing duplicates", total_processed_bytes, 100);
848
+
849
+ // save the global hash map
850
+ // saveGlobalHashmap(global); // This will append to the existing file or
851
+ // create a new one if not present
852
+
853
+ return 0;
854
+ }