dalla-data-processing 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. dalla/__init__.py +27 -0
  2. dalla/cli.py +453 -0
  3. dalla/core/__init__.py +6 -0
  4. dalla/core/dataset.py +387 -0
  5. dalla/core/parallel.py +279 -0
  6. dalla/deduplication/__init__.py +370 -0
  7. dalla/deduplication/bin/.gitignore +1 -0
  8. dalla/deduplication/bin/onion-linux-x86_64 +0 -0
  9. dalla/deduplication/onion/COPYING +24 -0
  10. dalla/deduplication/onion/Makefile +21 -0
  11. dalla/deduplication/onion/Makefile.config +3 -0
  12. dalla/deduplication/onion/README.md +21 -0
  13. dalla/deduplication/onion/src/Makefile +22 -0
  14. dalla/deduplication/onion/src/Makefile.g +23 -0
  15. dalla/deduplication/onion/src/buzhash.c +325 -0
  16. dalla/deduplication/onion/src/buzhash.h +30 -0
  17. dalla/deduplication/onion/src/hashdup.c +172 -0
  18. dalla/deduplication/onion/src/hashgen.c +206 -0
  19. dalla/deduplication/onion/src/onion +0 -0
  20. dalla/deduplication/onion/src/onion.c +799 -0
  21. dalla/deduplication/onion/src/onion_dup.c +824 -0
  22. dalla/deduplication/onion/src/version.c +17 -0
  23. dalla/deduplication/onion/src/version.h +10 -0
  24. dalla/deduplication/onion/src_sc/Makefile +22 -0
  25. dalla/deduplication/onion/src_sc/Makefile.g +23 -0
  26. dalla/deduplication/onion/src_sc/buzhash.c +325 -0
  27. dalla/deduplication/onion/src_sc/buzhash.h +30 -0
  28. dalla/deduplication/onion/src_sc/hashdup +0 -0
  29. dalla/deduplication/onion/src_sc/hashdup.c +172 -0
  30. dalla/deduplication/onion/src_sc/hashgen +0 -0
  31. dalla/deduplication/onion/src_sc/hashgen.c +206 -0
  32. dalla/deduplication/onion/src_sc/onion.c +854 -0
  33. dalla/deduplication/onion/src_sc/onion_dup.c +824 -0
  34. dalla/deduplication/onion/src_sc/version.c +17 -0
  35. dalla/deduplication/onion/src_sc/version.h +10 -0
  36. dalla/deduplication/onion_wrapper.py +223 -0
  37. dalla/deduplication/postprocessing.py +216 -0
  38. dalla/deduplication/preprocessing.py +120 -0
  39. dalla/quality/__init__.py +5 -0
  40. dalla/quality/checker.py +354 -0
  41. dalla/readability/__init__.py +197 -0
  42. dalla/readability/ranking.py +165 -0
  43. dalla/readability/scorer.py +148 -0
  44. dalla/stemming/__init__.py +551 -0
  45. dalla/stemming/data/words_al.txt +3414 -0
  46. dalla/stemming/data/words_al_t.txt +885 -0
  47. dalla/stemming/data/words_t.txt +7 -0
  48. dalla/utils/__init__.py +10 -0
  49. dalla/utils/logger.py +128 -0
  50. dalla/utils/tokenize.py +89 -0
  51. dalla_data_processing-0.0.1.dist-info/METADATA +393 -0
  52. dalla_data_processing-0.0.1.dist-info/RECORD +55 -0
  53. dalla_data_processing-0.0.1.dist-info/WHEEL +5 -0
  54. dalla_data_processing-0.0.1.dist-info/entry_points.txt +2 -0
  55. dalla_data_processing-0.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,799 @@
1
+ /*********************************************************************
2
+ * Copyright (c) 2011-2020 Jan Pomikalek, Milos Jakubicek *
3
+ * All rights reserved. *
4
+ * *
5
+ * This software is licensed as described in the file COPYING, which *
6
+ * you should have received as part of this distribution. *
7
+ *********************************************************************
8
+ * *
9
+ * Refactored in 2023 by Sally Choker and Fadi Zaraket. *
10
+ * Modifications: *
11
+ * - Adapted to take as input a list of file paths instead of *
12
+ * a single file. *
13
+ * - Enhanced to detect the source of duplicate files. *
14
+ *********************************************************************/
15
+
16
+ #include "buzhash.h"
17
+ #include "version.h"
18
+ #include <algorithm>
19
+ #include <errno.h>
20
+ #include <fstream>
21
+ #include <iostream>
22
+ #include <sstream>
23
+ #include <stdio.h>
24
+ #include <string.h>
25
+ #include <sys/resource.h>
26
+ #include <sys/stat.h>
27
+ #include <sys/time.h>
28
+ #include <time.h>
29
+ #include <unistd.h>
30
+ #include <unordered_map>
31
+ #include <vector>
32
+
33
+ #define GOOGLE_SPARSE
34
+
35
+ #if defined GOOGLE_SPARSE
36
+ #include <sparsehash/sparse_hash_map>
37
+ // #include <sparsehash/sparse_hash_set>
38
+ // using google::sparse_hash_set;
39
+ #elif defined __GNUC__ || defined __APPLE__
40
+ #include <ext/hash_map>
41
+ namespace std { using namespace __gnu_cxx; }
42
+ #else
43
+ #include <hash_map>
44
+ #endif
45
+ using namespace std;
46
+
47
+ #ifdef GOOGLE_SPARSE
48
+ // typedef sparse_hash_set<uint64_t> ngrhash;
49
+ typedef google::sparse_hash_map<uint64_t, int> ngrhash;
50
+ #else
51
+ // typedef hash_map<uint64_t,bool> ngrhash;
52
+ typedef hash_map<uint64_t, int> ngrhash;
53
+ #endif
54
+
55
// Record tying a file to the earlier file it duplicates.
struct DupVal {
  // Name of the earlier file, as extracted from the global map.
  std::string original_f_name;
  // Name of the file currently being processed.
  std::string dup_f_name;
  // Match counter; presumably the number of shared n-grams -- the struct is
  // not referenced elsewhere in this file, so confirm before relying on it.
  int matches;
};
60
+
61
// (file name, file index) pairs collected while scanning the current
// document: the name is the file being processed, the index identifies the
// earlier file that already contained the matching n-gram.
std::vector<std::pair<std::string, int>> foundFilenames;

// Pairs from foundFilenames that belong to documents with at least one bad
// (duplicated) paragraph; these are what gets written to the CSV reports.
std::vector<std::pair<std::string, int>> DuplicateFilenames;

// Indexes of files that were not checked for duplication because a single
// document did not fit in the buffer.
std::vector<int> UncheckedFilenames;
69
+
70
// Mask that clears the least significant bit of a hash. That bit is used as
// a seen/unseen flag when a list of duplicate n-gram hashes is supplied (-f).
#define BITMASK_HIGH63 0xfffffffffffffffeul

// Built-in defaults for the command-line options (see print_usage).
#define NGRAM_SIZE 7          // -n: n-gram length
#define DUPL_THRES 0.6        // -t: duplicate content threshold
#define DOC_TAG "doc"         // -d: document tag
#define PAR_TAG "p"           // -p: paragraph tag
#define TRIM_HASHES 64        // -T: number of hash bits kept
#define MAX_STUB_LENGTH 20    // -l: max stub length
#define BUFFER_SIZE 335544320 // -b: buffer size in bytes (320 MiB)
79
+
80
+ // options
81
+ int Ngram_size = NGRAM_SIZE;
82
+ float Dupl_thres = DUPL_THRES;
83
+ const char *Doc_tag = DOC_TAG;
84
+ const char *Par_tag = PAR_TAG;
85
+ int Strip_dupl = 0;
86
+ int No_smoothing = 0;
87
+ int Trim_hashes = TRIM_HASHES;
88
+ int Max_stub_length = MAX_STUB_LENGTH;
89
+ long Buffer_size = BUFFER_SIZE;
90
+ int Quiet = 0;
91
+ char *Dupl_hashes_path = NULL;
92
+ FILE *Input;
93
+ long int Input_size;
94
+ char current_file_name[2048];
95
+ int fileNameIndex = 0;
96
+
97
+ void print_usage(FILE *stream) {
98
+ fprintf(stream, "\
99
+ Usage: onion [OPTIONS] [FILE]\n\
100
+ Mark duplicate text parts in the input vertical file.\n\
101
+ \n\
102
+ -f FILE hashes of duplicate n-grams\n\
103
+ -n NUM n-gram length (default: %i)\n\
104
+ -t NUM duplicate content threshold (default: %.1f)\n\
105
+ -d STR document tag (default: %s)\n\
106
+ -p STR paragraph tag (default: %s)\n\
107
+ -s strip duplicate parts (rather than mark)\n\
108
+ -m no smoothing\n\
109
+ -T NUM trim n-gram hashes to NUM bits (default: %i)\n\
110
+ -l NUM max stub length (default: %i)\n\
111
+ -b NUM buffer size, in bytes (default: %i)\n\
112
+ -q quiet; suppress all output except for errors\n\
113
+ \n\
114
+ -V print version information and exit\n\
115
+ -h display this help and exit\n\
116
+ \n\
117
+ With no FILE, or when FILE is -, read standard input.\n\
118
+ Output is written to standard output.\n\
119
+ \n\
120
+ Project home page: <http://code.google.com/p/onion/>\n",
121
+ NGRAM_SIZE, DUPL_THRES, DOC_TAG, PAR_TAG, TRIM_HASHES,
122
+ MAX_STUB_LENGTH, BUFFER_SIZE);
123
+ }
124
+
125
/*
 * Prints one timestamped progress line to stderr.
 *
 * task_descr       short label of the running task
 * processed_bytes  bytes handled so far (reported in MB)
 * percent_done     completion percentage; a negative value suppresses the
 *                  percentage field
 *
 * The line also carries ru_maxrss / 1024 as reported by getrusage().
 */
void print_progress(const char *task_descr, unsigned long int processed_bytes,
                    float percent_done) {
  time_t now;
  time(&now);

  struct rusage usage;
  getrusage(RUSAGE_SELF, &usage);

  // processed_bytes is unsigned, so it must be printed with %lu (the
  // previous %li mismatched the argument type).
  fprintf(stderr, "[%.24s] onion: %s: %6lu MB processed", ctime(&now),
          task_descr, processed_bytes / (1024 * 1024));
  if (percent_done >= 0)
    fprintf(stderr, " (%6.2f%%)", percent_done);
  fprintf(stderr, "\t%6li MB RAM used", usage.ru_maxrss / 1024);
  fprintf(stderr, "\n");
}
138
+
139
+ void saveGlobalHashmap(const ngrhash &global) {
140
+ std::ofstream outFile("path", std::ios::binary | std::ios::app);
141
+ if (!outFile.is_open()) {
142
+ throw std::runtime_error("Failed to open file for writing");
143
+ }
144
+
145
+ for (const auto &pair : global) {
146
+ outFile.write(reinterpret_cast<const char *>(&pair.first),
147
+ sizeof(pair.first));
148
+ outFile.write(reinterpret_cast<const char *>(&pair.second),
149
+ sizeof(pair.second));
150
+ }
151
+
152
+ outFile.close();
153
+ }
154
+
155
// Returns true when stat() succeeds on `name`, i.e. the path can be resolved
// to an existing filesystem entry.
bool fileExists(const std::string &name) {
  struct stat info;
  const int rc = stat(name.c_str(), &info);
  return rc == 0;
}
159
+
160
/*
 * Appends (file name, file index) pairs to a CSV file, one pair per line in
 * the form "name",index.
 *
 * vec       pairs to write
 * fullPath  destination file; created when missing, appended to otherwise
 *
 * Double quotes inside a file name are doubled so the quoted field stays
 * valid CSV. Failures are reported on stderr; nothing is thrown.
 */
void writeFilenameDuplicateToCSV(
    const std::vector<std::pair<std::string, int>> &vec,
    const std::string &fullPath) {
  // Append mode already creates a missing file, so no existence probe is
  // needed (the probe also truncated files that exist but are unreadable).
  std::ofstream file(fullPath, std::ios::app);
  if (!file.is_open()) {
    std::cerr << "Failed to open the file: " << fullPath << std::endl;
    return;
  }

  for (const auto &entry : vec) {
    file << '"';
    for (const char ch : entry.first) {
      if (ch == '"')
        file << '"'; // escape embedded quote by doubling it
      file << ch;
    }
    file << "\"," << entry.second << "\n";
  }

  file.close();
  if (!file) {
    std::cerr << "Failed to write to the file: " << fullPath << std::endl;
  }
}
179
+
180
/*
 * Appends file indexes to a text/CSV file, one index per line.
 *
 * vec       indexes to write
 * fullPath  destination file; created when missing, appended to otherwise
 *
 * Failures are reported on stderr; nothing is thrown.
 */
void writeUncheckedFilenamesToCSV(const std::vector<int> &vec,
                                  const std::string &fullPath) {
  // Append mode creates the file when it does not exist yet, which makes a
  // separate existence check unnecessary.
  std::ofstream file(fullPath, std::ios::app);
  if (!file.is_open()) {
    std::cerr << "Failed to open the file: " << fullPath << std::endl;
    return;
  }

  for (const int index : vec) {
    file << index << "\n";
  }

  file.close();
  if (!file) {
    std::cerr << "Failed to write to the file: " << fullPath << std::endl;
  }
}
198
+
199
+ int process_one_par(int *&pars, char **&tokens, buzhash_buffer_t &bh_buffer,
200
+ hash_t &hash_bitmask, ngrhash &local, ngrhash &global,
201
+ int &bad_tokens, int &tok_i, int &par_i, int &total_tokens,
202
+ int &prev_bad_tokens, int &have_dupl_ngrams) {
203
+ ngrhash::const_iterator it;
204
+ for (tok_i = pars[par_i]; tok_i < pars[par_i + 1]; tok_i++) {
205
+ char *token = tokens[tok_i];
206
+ if (token[0] == '<')
207
+ continue;
208
+ total_tokens++;
209
+ prev_bad_tokens--;
210
+ if (prev_bad_tokens < 0)
211
+ prev_bad_tokens = 0;
212
+ hash_t hash = buzhash(token, &bh_buffer);
213
+ hash_t masked_hash = hash & hash_bitmask;
214
+ if (!buzhash_is_full_buffer(&bh_buffer))
215
+ continue;
216
+ it = local.find(hash);
217
+ if (it == local.end()) {
218
+ if (have_dupl_ngrams) {
219
+ // test with the last bit set to 1
220
+ // // (check against already seen duplicate ngrams)
221
+ it = global.find(masked_hash | 1);
222
+ } else {
223
+ it = global.find(masked_hash);
224
+ }
225
+ }
226
+ if (it != global.end()) {
227
+ bad_tokens += Ngram_size - prev_bad_tokens;
228
+ prev_bad_tokens = Ngram_size;
229
+ if (fileNameIndex != it->second) {
230
+ foundFilenames.push_back(make_pair(current_file_name, it->second));
231
+ }
232
+ }
233
+ #ifdef GOOGLE_SPARSE
234
+ local.insert(std::make_pair(hash, fileNameIndex));
235
+ #else
236
+ local[hash] = fileNameIndex;
237
+ #endif
238
+ }
239
+ }
240
+
241
+ int process_one_file(int &buffer_content, char **&tokens, int *&pars,
242
+ int *&par_len, char *&bad_par,
243
+
244
+ int *docs,
245
+
246
+ char *doc_tag, int doc_tag_len, char *doc_end_tag,
247
+ int doc_end_tag_len, char *par_tag, int par_tag_len,
248
+ char *par_end_tag, int par_end_tag_len,
249
+
250
+ int &have_dupl_ngrams,
251
+ unsigned long int &total_processed_bytes,
252
+
253
+ // make sure these become mutable
254
+ ngrhash &global, ngrhash &local,
255
+ buzhash_buffer_t &bh_buffer, char *buffer) {
256
+ int bytes_read = fread(buffer + buffer_content, sizeof(char),
257
+ Buffer_size - buffer_content, Input);
258
+ // print("Buffer %x, BufferContent %d, BufferSize %ld, \n", buffer,
259
+ // buffer_content, Buffer_size);
260
+ hash_t hash_bitmask = 0xfffffffffffffffful;
261
+
262
+ int buffer_size = buffer_content + bytes_read;
263
+ buffer[buffer_size] = '\0'; // make it a string
264
+ char *buffer_pos = buffer;
265
+
266
+ // find tokens
267
+ int token_count = 0;
268
+ tokens[token_count++] = buffer_pos++;
269
+ while ((buffer_pos = strchr(buffer_pos, '\n')) != NULL) {
270
+ buffer_pos[0] = '\0';
271
+ tokens[token_count++] = ++buffer_pos;
272
+ }
273
+
274
+ // find docs and paragraphs
275
+ int doc_count = 0;
276
+ int par_count = 0;
277
+ docs[doc_count++] = 0;
278
+ pars[par_count++] = 0;
279
+ int start_doc_next = 0;
280
+ int start_par_next = 0;
281
+ int i;
282
+
283
+ for (i = 1; i < token_count; i++) {
284
+ // "<doc>" or "<doc "
285
+ if (start_doc_next ||
286
+ (strncmp(tokens[i], doc_tag, doc_tag_len) == 0 &&
287
+ (tokens[i][doc_tag_len] == ' ' || tokens[i][doc_tag_len] == '>'))) {
288
+ docs[doc_count++] = par_count;
289
+ pars[par_count++] = i;
290
+ start_doc_next = 0;
291
+ }
292
+ // "</doc>"
293
+ else if (strncmp(tokens[i], doc_end_tag, doc_end_tag_len) == 0) {
294
+ start_doc_next = 1;
295
+ }
296
+ // "<p>" or "<p "
297
+ else if (start_par_next || (strncmp(tokens[i], par_tag, par_tag_len) == 0 &&
298
+ (tokens[i][par_tag_len] == ' ' ||
299
+ tokens[i][par_tag_len] == '>'))) {
300
+ pars[par_count++] = i;
301
+ start_par_next = 0;
302
+ }
303
+ // "</p>"
304
+ else if (strncmp(tokens[i], par_end_tag, par_end_tag_len) == 0) {
305
+ start_par_next = 1;
306
+ }
307
+ }
308
+
309
+ if (doc_count == 1 && !feof(Input)) {
310
+ // full buffer contains only one document
311
+ // by default, documents with higher than 16MB are not checked for dedup
312
+ fprintf(stderr, "Too long document at byte %li.\n", total_processed_bytes);
313
+ UncheckedFilenames.push_back(fileNameIndex);
314
+ return 1;
315
+ }
316
+
317
+ if (feof(Input)) {
318
+ // create sentinels
319
+ docs[doc_count++] = par_count;
320
+ if (strlen(tokens[token_count - 1]) == 0) {
321
+ // files ending with a newline have a natural sentinel
322
+ // (the last zero-length token)
323
+ pars[par_count++] = token_count - 1;
324
+ } else {
325
+ // for other files, we need to make up the last token
326
+ pars[par_count++] = token_count;
327
+ tokens[token_count++] = buffer + buffer_size;
328
+ }
329
+ }
330
+
331
+ // for all documents
332
+ int doc_i;
333
+ for (doc_i = 0; doc_i < doc_count - 1; doc_i++) {
334
+ buzhash_clear_buffer(&bh_buffer);
335
+ local.clear();
336
+ // for all paragraphs in the document
337
+ int par_i;
338
+ for (par_i = docs[doc_i]; par_i < docs[doc_i + 1]; par_i++) {
339
+ int total_tokens = 0;
340
+ int bad_tokens = 0;
341
+ /* prev_bad_tokens is the number of tokens in the current
342
+ * n-gram which are contained in one of the previous bad
343
+ * n-grams.
344
+ *
345
+ * At the beginning of a new paragraph we need to pretend that
346
+ * there are Ngram_size prev_bad_tokens so that the leading
347
+ * bad n-grams do not generate too many bod tokens. */
348
+ int prev_bad_tokens = Ngram_size;
349
+ // for all tokens in the paragraph
350
+ // for every token in the current paragraph, it computes its hash and
351
+ // checks if it is present in the local and global hash maps
352
+ int tok_i;
353
+ for (tok_i = pars[par_i]; tok_i < pars[par_i + 1]; tok_i++) {
354
+ process_one_par(pars, tokens, bh_buffer, hash_bitmask, local, global,
355
+ bad_tokens, tok_i, par_i, total_tokens, prev_bad_tokens,
356
+ have_dupl_ngrams);
357
+ }
358
+
359
+ // remember the length of the paragraph
360
+ par_len[par_i] = total_tokens;
361
+
362
+ // mark bad paragraphs
363
+ bad_par[par_i] =
364
+ (total_tokens > 0 && (1.0 * bad_tokens / total_tokens) > Dupl_thres);
365
+ }
366
+
367
+ // smoothing
368
+ if (!No_smoothing) {
369
+ int last_bad_par = docs[doc_i] - 1;
370
+ int stub_length = 0;
371
+ for (par_i = docs[doc_i]; par_i <= docs[doc_i + 1]; par_i++) {
372
+ if (par_i == docs[doc_i + 1] || bad_par[par_i]) {
373
+ if (stub_length <= Max_stub_length) {
374
+ // remove stub
375
+ int par_j;
376
+ for (par_j = last_bad_par + 1; par_j < par_i; par_j++)
377
+ bad_par[par_j] = 1;
378
+ }
379
+ last_bad_par = par_i;
380
+ stub_length = 0;
381
+ } else {
382
+ stub_length += par_len[par_i];
383
+ }
384
+ }
385
+ }
386
+
387
+ int count_bad = 0;
388
+ for (par_i = docs[doc_i]; par_i < docs[doc_i + 1]; par_i++) {
389
+ if (!bad_par[par_i] && par_len[par_i] > 0) {
390
+ } else {
391
+ count_bad += 1;
392
+ }
393
+ }
394
+ if (count_bad > 0) {
395
+ std::sort(foundFilenames.begin(), foundFilenames.end());
396
+ auto uniqueEnd =
397
+ std::unique(foundFilenames.begin(), foundFilenames.end());
398
+ foundFilenames.erase(uniqueEnd, foundFilenames.end());
399
+ printf("%s is %d bad\n", current_file_name, count_bad);
400
+ DuplicateFilenames.insert(DuplicateFilenames.end(),
401
+ foundFilenames.begin(), foundFilenames.end());
402
+ // DuplicateFilenames.push_back(current_file_name);
403
+ }
404
+
405
+ foundFilenames.clear();
406
+
407
+ // is there at least one good paragraph?
408
+ int all_bad = 1;
409
+ for (par_i = docs[doc_i]; par_i < docs[doc_i + 1]; par_i++) {
410
+ if (!bad_par[par_i] && par_len[par_i] > 0) {
411
+ all_bad = 0;
412
+ break;
413
+ }
414
+ }
415
+
416
+ buzhash_clear_buffer(&bh_buffer);
417
+ // for all paragraphs in the document (again)
418
+ for (par_i = docs[doc_i]; par_i < docs[doc_i + 1]; par_i++) {
419
+ int first_token = pars[docs[doc_i]];
420
+ int last_token = pars[docs[doc_i + 1]] - 1;
421
+ // for all tokens in the paragraph
422
+ int tok_i;
423
+ for (tok_i = pars[par_i]; tok_i < pars[par_i + 1]; tok_i++) {
424
+ char *token = tokens[tok_i];
425
+ int bad_token = bad_par[par_i];
426
+ // tags at document boundaries are preserved if there is
427
+ // at least one good paragraph, removed otherwise
428
+ if (tok_i == first_token && strncmp(token, doc_tag, doc_tag_len) == 0)
429
+ bad_token = all_bad;
430
+ if (tok_i == last_token && strcmp(token, doc_end_tag) == 0)
431
+ bad_token = all_bad;
432
+ if (token[0] == '<')
433
+ continue;
434
+ // store hashes of n-grams
435
+ hash_t hash = buzhash(token, &bh_buffer);
436
+ hash_t masked_hash = hash & hash_bitmask;
437
+ if (!buzhash_is_full_buffer(&bh_buffer))
438
+ continue;
439
+ if (!bad_par[par_i]) {
440
+ if (have_dupl_ngrams) {
441
+ // If we have the list of hashes of all duplicate
442
+ // n-grams, we set the least significant bit of the
443
+ // stored hash to 1 if we have seen the matching
444
+ // duplicate n-gram to indicate it has been seen.
445
+ // Unique n-grams are ignored.
446
+ if (global.erase(masked_hash & BITMASK_HIGH63))
447
+ #ifdef GOOGLE_SPARSE
448
+ global.insert(std::make_pair(masked_hash | 1, fileNameIndex));
449
+
450
+ #else
451
+ global[masked_hash | 1] = fileNameIndex;
452
+ #endif
453
+ } else {
454
+ // otherwise we have to store hashes of all n-grams
455
+ #ifdef GOOGLE_SPARSE
456
+ global.insert(std::make_pair(masked_hash, fileNameIndex));
457
+ #else
458
+ global[masked_hash] = fileNameIndex;
459
+ #endif
460
+ }
461
+ }
462
+ }
463
+ }
464
+ }
465
+
466
+ // copy the unprocessed data to the beginning of the buffer
467
+ if (!feof(Input)) {
468
+ char *last_doc_fst_tok = tokens[pars[docs[doc_count - 1]]];
469
+ int processed_bytes = last_doc_fst_tok - buffer;
470
+ total_processed_bytes += processed_bytes;
471
+ int remaining_bytes = buffer_size - processed_bytes;
472
+ char *buffer_end = last_doc_fst_tok + remaining_bytes;
473
+ char *pos;
474
+ // replace \0s with EOLs (revert the buffer contents to original)
475
+ for (pos = last_doc_fst_tok; pos < buffer_end; pos++)
476
+ if (pos[0] == '\0')
477
+ pos[0] = '\n';
478
+ // print progress information
479
+ if (!Quiet) {
480
+ float percent_done = -1;
481
+ if (Input_size > 0)
482
+ percent_done = 100.0 * total_processed_bytes / Input_size;
483
+ print_progress("removing duplicates", total_processed_bytes,
484
+ percent_done);
485
+ }
486
+ memmove(buffer, last_doc_fst_tok, remaining_bytes);
487
+ buffer_content = remaining_bytes;
488
+ }
489
+ printf("end of process_one_file\n");
490
+ }
491
+
492
+ int main(int argc, char **argv) {
493
+ printf("Dupl Threshold %f \n", Dupl_thres);
494
+ printf("N-gram size %d \n", Ngram_size);
495
+ // get options
496
+ int c;
497
+ char *endptr;
498
+ char *datasetname = NULL;
499
+ char *listOfFilesPath = NULL;
500
+ char *output_dir = NULL;
501
+
502
+ while ((c = getopt(argc, argv, "f:n:t:d:p:smT:l:b:qVhD:L:O:")) != -1) {
503
+ errno = 0;
504
+ switch (c) {
505
+ case 'f':
506
+ Dupl_hashes_path = optarg;
507
+ break;
508
+ case 'n':
509
+ Ngram_size = strtol(optarg, &endptr, 10);
510
+ if (errno != 0 || *endptr != '\0') {
511
+ fprintf(stderr, "Integer value expected for -n, got: %s\n", optarg);
512
+ print_usage(stderr);
513
+ return 1;
514
+ }
515
+ break;
516
+ case 't':
517
+ Dupl_thres = strtod(optarg, &endptr);
518
+ if (errno != 0 || *endptr != '\0') {
519
+ fprintf(stderr, "Float value expected for -t, got: %s\n", optarg);
520
+ print_usage(stderr);
521
+ return 1;
522
+ }
523
+ break;
524
+ case 'd':
525
+ Doc_tag = optarg;
526
+ break;
527
+ case 'p':
528
+ Par_tag = optarg;
529
+ break;
530
+ case 's':
531
+ Strip_dupl = 1;
532
+ break;
533
+ case 'm':
534
+ No_smoothing = 1;
535
+ break;
536
+ case 'T':
537
+ Trim_hashes = strtol(optarg, &endptr, 10);
538
+ if (errno != 0 || *endptr != '\0') {
539
+ fprintf(stderr, "Integer value expected for -T, got: %s\n", optarg);
540
+ print_usage(stderr);
541
+ return 1;
542
+ }
543
+ break;
544
+ case 'l':
545
+ Max_stub_length = strtol(optarg, &endptr, 10);
546
+ if (errno != 0 || *endptr != '\0') {
547
+ fprintf(stderr, "Integer value expected for -l, got: %s\n", optarg);
548
+ print_usage(stderr);
549
+ return 1;
550
+ }
551
+ break;
552
+ case 'b':
553
+ Buffer_size = strtol(optarg, &endptr, 10);
554
+ if (errno != 0 || *endptr != '\0') {
555
+ fprintf(stderr, "Integer value expected for -b, got: %s\n", optarg);
556
+ print_usage(stderr);
557
+ return 1;
558
+ }
559
+ break;
560
+ case 'q':
561
+ Quiet = 1;
562
+ break;
563
+ case 'V':
564
+ print_version("onion");
565
+ return 0;
566
+ case 'h':
567
+ print_usage(stdout);
568
+ return 0;
569
+ case '?':
570
+ print_usage(stderr);
571
+ return 1;
572
+
573
+ case 'D':
574
+ datasetname = optarg;
575
+ break;
576
+ case 'L':
577
+ listOfFilesPath = optarg;
578
+ break;
579
+ case 'O':
580
+ output_dir = optarg;
581
+ break;
582
+ }
583
+ }
584
+
585
+ if (output_dir == NULL || datasetname == NULL || listOfFilesPath == NULL) {
586
+ std::cerr << "Missing required parameters.\n";
587
+ return 1;
588
+ }
589
+
590
+ FILE *List_of_Files = fopen(listOfFilesPath, "r");
591
+ if (List_of_Files == NULL) {
592
+ perror("Unable to open list of files");
593
+ return 1;
594
+ }
595
+
596
+ struct stat st = {0};
597
+ if (stat(output_dir, &st) == -1) {
598
+ if (mkdir(output_dir, 0700) != 0) {
599
+ perror("Failed to create output directory");
600
+ return 1;
601
+ }
602
+ }
603
+
604
+ Input = stdin;
605
+ Input_size = -1;
606
+ if (optind < argc) {
607
+ char *filename = argv[optind];
608
+ if (strcmp(filename, "-") != 0) {
609
+ errno = 0;
610
+ Input = fopen(filename, "r");
611
+ if (errno != 0) {
612
+ fprintf(stderr, "Unable to open %s for reading.\n", filename);
613
+ return 1;
614
+ }
615
+ fseek(Input, 0L, SEEK_END);
616
+ Input_size = ftell(Input);
617
+ fseek(Input, 0L, SEEK_SET);
618
+ }
619
+ }
620
+
621
+ unsigned long int total_processed_bytes = 0;
622
+
623
+ // patterns
624
+ char *doc_tag = (char *)malloc((strlen(Doc_tag) + 1 + 1) * sizeof(char));
625
+ strcat(strcpy(doc_tag, "<"), Doc_tag);
626
+ char *doc_end_tag = (char *)malloc((strlen(Doc_tag) + 3 + 1) * sizeof(char));
627
+ strcat(strcat(strcpy(doc_end_tag, "</"), Doc_tag), ">");
628
+ char *par_tag = (char *)malloc((strlen(Par_tag) + 1 + 1) * sizeof(char));
629
+ strcat(strcpy(par_tag, "<"), Par_tag);
630
+ char *par_end_tag = (char *)malloc((strlen(Par_tag) + 3 + 1) * sizeof(char));
631
+ strcat(strcat(strcpy(par_end_tag, "</"), Par_tag), ">");
632
+
633
+ int doc_tag_len = strlen(doc_tag);
634
+ int doc_end_tag_len = strlen(doc_end_tag);
635
+ int par_tag_len = strlen(par_tag);
636
+ int par_end_tag_len = strlen(par_end_tag);
637
+
638
+ // bitmask for trimming ngram hashes
639
+ hash_t hash_bitmask = 0xfffffffffffffffful;
640
+ int bitshift = 64 - Trim_hashes;
641
+ if (bitshift > 0)
642
+ hash_bitmask >>= bitshift;
643
+
644
+ // data structures
645
+ int buffer_size = 0;
646
+ int buffer_content = 0;
647
+ char *buffer = (char *)malloc((Buffer_size + 1) * sizeof(char));
648
+ char **tokens = (char **)malloc((Buffer_size + 1) * sizeof(char *));
649
+ int *pars = (int *)malloc((Buffer_size + 1) *
650
+ sizeof(int)); // array of starting tokens
651
+ int *par_len = (int *)malloc((Buffer_size + 1) * sizeof(int));
652
+ char *bad_par = (char *)malloc((Buffer_size + 1) * sizeof(char));
653
+ int *docs =
654
+ (int *)malloc((Buffer_size + 1) * sizeof(int)); // array of starting pars
655
+ int token_count, par_count, doc_count;
656
+
657
+ // buzhash
658
+ buzhash_buffer_t bh_buffer;
659
+ buzhash_init_buffer(&bh_buffer, Ngram_size);
660
+
661
+ // global hash table stores the hashes of all files read so far (in our case
662
+ // the full document since we did not mark paragraph separators) local hash
663
+ // table stores the hahses of n-grams found within the currently processed
664
+ // file (doc as a whole in our case)
665
+ ngrhash global, local;
666
+ #ifdef GOOGLE_SPARSE
667
+ global.set_deleted_key(0);
668
+ local.set_deleted_key(0);
669
+ #endif
670
+
671
+ // read hashes of duplicate n-grams if available
672
+ int have_dupl_ngrams = 0;
673
+ if (Dupl_hashes_path != NULL) {
674
+ have_dupl_ngrams = 1;
675
+ errno = 0;
676
+ FILE *ngrams_fp = fopen(Dupl_hashes_path, "r");
677
+ if (errno != 0) {
678
+ fprintf(stderr, "Unable to open %s for reading.\n", Dupl_hashes_path);
679
+ return 1;
680
+ }
681
+ fseek(ngrams_fp, 0L, SEEK_END);
682
+ unsigned long int ngrams_size = ftell(ngrams_fp);
683
+ fseek(ngrams_fp, 0L, SEEK_SET);
684
+
685
+ unsigned long int bytes_read = 0;
686
+ hash_t hash;
687
+ while (fread(&hash, sizeof(hash), 1, ngrams_fp)) {
688
+ printf("reading");
689
+ bytes_read += sizeof(hash);
690
+ hash_t masked_hash = hash & hash_bitmask;
691
+ // store only the 63 most significant bits of the hash;
692
+ // reserve the last bit as a flag (seen / unseen)
693
+ // #ifdef GOOGLE_SPARSE
694
+ // global.insert(std::make_pair(masked_hash | 1,
695
+ // fileNameIndex));//global.insert (masked_hash & BITMASK_HIGH63); #else
696
+ // global[masked_hash & BITMASK_HIGH63] = true;
697
+ // #endif
698
+
699
+ // print progress information
700
+ if (!Quiet && bytes_read % (10000000 * sizeof(hash)) == 0) {
701
+ float percent_done = -1;
702
+ if (ngrams_size > 0)
703
+ percent_done = 100.0 * bytes_read / ngrams_size;
704
+ print_progress("reading hashes", bytes_read, percent_done);
705
+ }
706
+ }
707
+
708
+ if (!Quiet)
709
+ print_progress("reading hashes", bytes_read, 100);
710
+ }
711
+
712
+ int fileIndex =
713
+ 0; // File index t store every 1000 duplicates in a new csv file
714
+
715
+ while (!feof(List_of_Files)) {
716
+ fileNameIndex++;
717
+ /* read the name of each file in the list of files*/
718
+ if (fgets(current_file_name, sizeof(current_file_name), List_of_Files) ==
719
+ NULL) {
720
+ break;
721
+ }
722
+ int last_ch = current_file_name[strlen(current_file_name) - 1];
723
+ if (last_ch == '\n') {
724
+ current_file_name[strlen(current_file_name) - 1] = '\0';
725
+ }
726
+ /* use fopen to assign Input to it*/
727
+ Input = fopen(current_file_name, "r");
728
+ if (Input == NULL) {
729
+ char err_msg[1024];
730
+ snprintf(err_msg, 1023, "Error to open data file %s--",
731
+ current_file_name);
732
+ perror(err_msg);
733
+ continue;
734
+ }
735
+
736
+ printf("File Number: %d \n", fileNameIndex);
737
+
738
+ /* Process it as below*/
739
+ // it modifies the data structures passed to it as arguments, processes the
740
+ // input file, and updates global state
741
+ process_one_file(buffer_content, /* int */
742
+ tokens, /* char** */
743
+ pars, /* int* */
744
+ par_len, /* int* */
745
+ bad_par, /* char* */
746
+
747
+ docs, /* int* */
748
+
749
+ doc_tag, /* char* */
750
+ doc_tag_len, /* int */
751
+ doc_end_tag, /* char* */
752
+ doc_end_tag_len, /* int */
753
+ par_tag, /* char* */
754
+ par_tag_len, /* int */
755
+ par_end_tag, /* char* */
756
+ par_end_tag_len, /* int */
757
+
758
+ have_dupl_ngrams, /* long int int */
759
+ total_processed_bytes, /* unsigned */
760
+ // make sure these become mutable
761
+ global, /* ngrhash */
762
+ local, /* ngrhash */
763
+ bh_buffer, /* buzhash_buffer_t */
764
+ buffer /* char* */
765
+ );
766
+
767
+ if (DuplicateFilenames.size() > 1000) {
768
+ fileIndex++;
769
+ std::string filename = std::string(output_dir) + "Duplicate_pair_files_" +
770
+ std::string(datasetname) + "_" +
771
+ to_string(fileIndex) + ".csv";
772
+ writeFilenameDuplicateToCSV(DuplicateFilenames, filename);
773
+ DuplicateFilenames.clear();
774
+ printf(" 1000 Duplicated files are saved\n");
775
+ }
776
+
777
+ fclose(Input);
778
+ }
779
+
780
+ fileIndex++;
781
+ std::string filename = std::string(output_dir) + "Duplicate_pair_files_" +
782
+ std::string(datasetname) + "_" + to_string(fileIndex) +
783
+ ".csv";
784
+ writeFilenameDuplicateToCSV(DuplicateFilenames, filename);
785
+ std::string name = std::string(output_dir) + "Unchecked_files_" +
786
+ std::string(datasetname) + "_" + ".csv";
787
+ writeUncheckedFilenamesToCSV(UncheckedFilenames, name);
788
+
789
+ // print progress information
790
+ total_processed_bytes += buffer_size;
791
+ if (!Quiet)
792
+ print_progress("removing duplicates", total_processed_bytes, 100);
793
+
794
+ // save the global hash map
795
+ // saveGlobalHashmap(global); // This will append to the existing file or
796
+ // create a new one if not present
797
+
798
+ return 0;
799
+ }