dalla-data-processing 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. dalla/__init__.py +27 -0
  2. dalla/cli.py +453 -0
  3. dalla/core/__init__.py +6 -0
  4. dalla/core/dataset.py +387 -0
  5. dalla/core/parallel.py +279 -0
  6. dalla/deduplication/__init__.py +370 -0
  7. dalla/deduplication/bin/.gitignore +1 -0
  8. dalla/deduplication/bin/onion-linux-x86_64 +0 -0
  9. dalla/deduplication/onion/COPYING +24 -0
  10. dalla/deduplication/onion/Makefile +21 -0
  11. dalla/deduplication/onion/Makefile.config +3 -0
  12. dalla/deduplication/onion/README.md +21 -0
  13. dalla/deduplication/onion/src/Makefile +22 -0
  14. dalla/deduplication/onion/src/Makefile.g +23 -0
  15. dalla/deduplication/onion/src/buzhash.c +325 -0
  16. dalla/deduplication/onion/src/buzhash.h +30 -0
  17. dalla/deduplication/onion/src/hashdup.c +172 -0
  18. dalla/deduplication/onion/src/hashgen.c +206 -0
  19. dalla/deduplication/onion/src/onion +0 -0
  20. dalla/deduplication/onion/src/onion.c +799 -0
  21. dalla/deduplication/onion/src/onion_dup.c +824 -0
  22. dalla/deduplication/onion/src/version.c +17 -0
  23. dalla/deduplication/onion/src/version.h +10 -0
  24. dalla/deduplication/onion/src_sc/Makefile +22 -0
  25. dalla/deduplication/onion/src_sc/Makefile.g +23 -0
  26. dalla/deduplication/onion/src_sc/buzhash.c +325 -0
  27. dalla/deduplication/onion/src_sc/buzhash.h +30 -0
  28. dalla/deduplication/onion/src_sc/hashdup +0 -0
  29. dalla/deduplication/onion/src_sc/hashdup.c +172 -0
  30. dalla/deduplication/onion/src_sc/hashgen +0 -0
  31. dalla/deduplication/onion/src_sc/hashgen.c +206 -0
  32. dalla/deduplication/onion/src_sc/onion.c +854 -0
  33. dalla/deduplication/onion/src_sc/onion_dup.c +824 -0
  34. dalla/deduplication/onion/src_sc/version.c +17 -0
  35. dalla/deduplication/onion/src_sc/version.h +10 -0
  36. dalla/deduplication/onion_wrapper.py +223 -0
  37. dalla/deduplication/postprocessing.py +216 -0
  38. dalla/deduplication/preprocessing.py +120 -0
  39. dalla/quality/__init__.py +5 -0
  40. dalla/quality/checker.py +354 -0
  41. dalla/readability/__init__.py +197 -0
  42. dalla/readability/ranking.py +165 -0
  43. dalla/readability/scorer.py +148 -0
  44. dalla/stemming/__init__.py +551 -0
  45. dalla/stemming/data/words_al.txt +3414 -0
  46. dalla/stemming/data/words_al_t.txt +885 -0
  47. dalla/stemming/data/words_t.txt +7 -0
  48. dalla/utils/__init__.py +10 -0
  49. dalla/utils/logger.py +128 -0
  50. dalla/utils/tokenize.py +89 -0
  51. dalla_data_processing-0.0.1.dist-info/METADATA +393 -0
  52. dalla_data_processing-0.0.1.dist-info/RECORD +55 -0
  53. dalla_data_processing-0.0.1.dist-info/WHEEL +5 -0
  54. dalla_data_processing-0.0.1.dist-info/entry_points.txt +2 -0
  55. dalla_data_processing-0.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,824 @@
1
+ /*********************************************************************
2
+ * Copyright (c) 2011-2020 Jan Pomikalek, Milos Jakubicek *
3
+ * All rights reserved. *
4
+ * *
5
+ * This software is licensed as described in the file COPYING, which *
6
+ * you should have received as part of this distribution. *
7
+ *********************************************************************/
8
+
9
+ #include "buzhash.h"
10
+ #include "version.h"
11
+ #include <algorithm>
12
+ #include <errno.h>
13
+ #include <fstream>
14
+ #include <iostream>
15
+ #include <sstream>
16
+ #include <stdio.h>
17
+ #include <string.h>
18
+ #include <sys/resource.h>
19
+ #include <sys/stat.h>
20
+ #include <sys/time.h>
21
+ #include <time.h>
22
+ #include <unistd.h>
23
+ #include <unordered_map>
24
+ #include <vector>
25
+
26
+ #define GOOGLE_SPARSE
27
+
28
+ #if defined GOOGLE_SPARSE
29
+ #include <sparsehash/sparse_hash_map>
30
+ // #include <sparsehash/sparse_hash_set>
31
+ // using google::sparse_hash_set;
32
+ #elif defined __GNUC__ || defined __APPLE__
33
+ #include <ext/hash_map>
34
+ namespace std { using namespace __gnu_cxx; }
35
+ #else
36
+ #include <hash_map>
37
+ #endif
38
+ using namespace std;
39
+
40
+ #ifdef GOOGLE_SPARSE
41
+ // typedef sparse_hash_set<uint64_t> ngrhash;
42
+ typedef google::sparse_hash_map<uint64_t, int> ngrhash;
43
+ #else
44
+ // typedef hash_map<uint64_t,bool> ngrhash;
45
+ typedef hash_map<uint64_t, int> ngrhash;
46
+ #endif
47
+
48
+ // initialize vector to store pair filenames and duplicates of all files
49
+ std::vector<std::pair<string, int>> foundFilenames;
50
+
51
+ // initialize vector to store only pairs of duplicated filenames
52
+ std::vector<std::pair<string, int>> DuplicateFilenames;
53
+ // initialize a vector to store documents not checked for deduplication (don't
54
+ // fit in the buffer)
55
+ std::vector<int> UncheckedFilenames;
56
+
57
+ // initialize a vector to store duplicate filenames and their score:
58
+ // bad_tokens/total_tokens
59
+ std::vector<std::pair<string, float>> DuplicateFilenamesScores;
60
+
61
+ #define BITMASK_HIGH63 0xfffffffffffffffeul
62
+
63
+ #define NGRAM_SIZE 20
64
+ #define DUPL_THRES 0.9
65
+ #define DOC_TAG "doc"
66
+ #define PAR_TAG "p"
67
+ #define TRIM_HASHES 64
68
+ #define MAX_STUB_LENGTH 20
69
+ #define BUFFER_SIZE 335544320
70
+
71
+ // options
72
+ int Ngram_size = NGRAM_SIZE; // -n: n-gram length
73
+ float Dupl_thres = DUPL_THRES; // -t: duplicate content threshold
74
+ const char *Doc_tag = DOC_TAG; // -d: document tag name (without angle brackets)
75
+ const char *Par_tag = PAR_TAG; // -p: paragraph tag name (without angle brackets)
76
+ int Strip_dupl = 0; // -s: set to 1 to strip duplicate parts rather than mark them
77
+ int No_smoothing = 0; // -m: set to 1 to disable smoothing
78
+ int Trim_hashes = TRIM_HASHES; // -T: number of bits n-gram hashes are trimmed to
79
+ int Max_stub_length = MAX_STUB_LENGTH; // -l: max stub length used by smoothing
80
+ long Buffer_size = BUFFER_SIZE; // -b: buffer size, in bytes
81
+ int Quiet = 0; // -q: set to 1 to suppress progress output
82
+ char *Dupl_hashes_path = NULL; // -f: file with hashes of duplicate n-grams; NULL when not given
83
+ FILE *Input; // stream process_one_file() reads from; main() reassigns it for every listed file
84
+ long int Input_size; // size of the positional input file from ftell(), or -1 when unknown
85
+ char current_file_name[2048]; // path of the file being processed, read by main() from the list file
86
+ int fileNameIndex = 0; // incremented by main() for each list entry; stored as the value in the n-gram hash maps
87
+
88
+ void print_usage(FILE *stream) { // Write the command-line help text to `stream` (stdout for -h, stderr on errors).
89
+ fprintf(stream, "\
90
+ Usage: onion [OPTIONS] [FILE]\n\
91
+ Mark duplicate text parts in the input vertical file.\n\
92
+ \n\
93
+ -f FILE hashes of duplicate n-grams\n\
94
+ -n NUM n-gram length (default: %i)\n\
95
+ -t NUM duplicate content threshold (default: %.1f)\n\
96
+ -d STR document tag (default: %s)\n\
97
+ -p STR paragraph tag (default: %s)\n\
98
+ -s strip duplicate parts (rather than mark)\n\
99
+ -m no smoothing\n\
100
+ -T NUM trim n-gram hashes to NUM bits (default: %i)\n\
101
+ -l NUM max stub length (default: %i)\n\
102
+ -b NUM buffer size, in bytes (default: %i)\n\
103
+ -q quiet; suppress all output except for errors\n\
104
+ \n\
105
+ -V print version information and exit\n\
106
+ -h display this help and exit\n\
107
+ \n\
108
+ With no FILE, or when FILE is -, read standard input.\n\
109
+ Output is written to standard output.\n\
110
+ \n\
111
+ Project home page: <http://code.google.com/p/onion/>\n",
112
+ NGRAM_SIZE, DUPL_THRES, DOC_TAG, PAR_TAG, TRIM_HASHES, // the placeholders are filled with the compile-time
113
+ MAX_STUB_LENGTH, BUFFER_SIZE); // defaults, not with values parsed from the command line
114
+ }
115
+
116
/**
 * Write one progress line to stderr: a timestamp, the task label, the number
 * of megabytes processed, an optional percentage and the peak memory use.
 *
 * @param task_descr       short label of the running task
 * @param processed_bytes  number of bytes handled so far
 * @param percent_done     completion percentage; a negative value suppresses
 *                         the percentage field
 */
void print_progress(const char *task_descr, unsigned long int processed_bytes,
                    float percent_done) {
    time_t now;
    time(&now);

    // "%.24s" keeps the 24 timestamp characters of ctime() and drops its
    // trailing newline. processed_bytes is unsigned, hence "%lu".
    fprintf(stderr, "[%.24s] onion: %s: %6lu MB processed", ctime(&now),
            task_descr, processed_bytes / (1024 * 1024));

    if (percent_done >= 0)
        fprintf(stderr, " (%6.2f%%)", percent_done);

    // Only report memory when getrusage() succeeded; otherwise `usage` would
    // be read uninitialised.
    // NOTE(review): ru_maxrss is in kilobytes on Linux, so "/ 1024" yields MB
    // there; other platforms may use a different unit.
    struct rusage usage;
    if (getrusage(RUSAGE_SELF, &usage) == 0)
        fprintf(stderr, "\t%6li MB RAM used", usage.ru_maxrss / 1024);

    fprintf(stderr, "\n");
}
129
+
130
+ void saveGlobalHashmap(const ngrhash &global) {
131
+ std::ofstream outFile("path", std::ios::binary | std::ios::app);
132
+ if (!outFile.is_open()) {
133
+ throw std::runtime_error("Failed to open file for writing");
134
+ }
135
+
136
+ for (const auto &pair : global) {
137
+ outFile.write(reinterpret_cast<const char *>(&pair.first),
138
+ sizeof(pair.first));
139
+ outFile.write(reinterpret_cast<const char *>(&pair.second),
140
+ sizeof(pair.second));
141
+ }
142
+
143
+ outFile.close();
144
+ }
145
+
146
/**
 * Tell whether `name` refers to an existing filesystem entry.
 *
 * @param name  path to probe
 * @return true when stat() succeeds on the path, false otherwise
 */
bool fileExists(const std::string &name) {
    struct stat info;
    const int rc = stat(name.c_str(), &info);
    return rc == 0;
}
150
+
151
/**
 * Append (filename, file index) pairs to a CSV file, one pair per line, in
 * the form "filename",index. The file is created when it does not exist.
 *
 * @param vec       pairs to write
 * @param filename  path of the CSV file
 *
 * Failures to open or write the file are reported on std::cerr.
 */
void writeFilenameDuplicateToCSV(
    const std::vector<std::pair<std::string, int>> &vec,
    const std::string &filename) {
    // std::ios::app appends to an existing file and creates a missing one,
    // so no separate existence probe is needed.
    std::ofstream file(filename, std::ios::app);
    if (!file.is_open()) {
        std::cerr << "Failed to open the file: " << filename << std::endl;
        return;
    }

    for (const auto &entry : vec) {
        file << '"';
        for (const char ch : entry.first) {
            // A quote inside a quoted CSV field is escaped by doubling it.
            if (ch == '"')
                file << '"';
            file << ch;
        }
        file << "\"," << entry.second << "\n";
    }

    file.close();
    if (file.fail())
        std::cerr << "Failed to write to the file: " << filename << std::endl;
}
174
+
175
/**
 * Append (filename, score) pairs to a CSV file, one pair per line, in the
 * form filename,score. The file is created when it does not exist.
 *
 * @param vec       pairs to write; the filename is written unquoted
 * @param filename  path of the CSV file
 *
 * Failures to open or write the file are reported on std::cerr.
 */
void writeFilenameDuplicateScoreToCSV(
    const std::vector<std::pair<std::string, float>> &vec,
    const std::string &filename) {
    // std::ios::app appends to an existing file and creates a missing one,
    // so no separate existence probe is needed.
    std::ofstream file(filename, std::ios::app);
    if (!file.is_open()) {
        std::cerr << "Failed to open the file: " << filename << std::endl;
        return;
    }

    for (const auto &entry : vec)
        file << entry.first << "," << entry.second << "\n";

    file.close();
    if (file.fail())
        std::cerr << "Failed to write to the file: " << filename << std::endl;
}
198
+
199
/**
 * Append a list of file indices to a text file, one index per line. The file
 * is created when it does not exist.
 *
 * @param vec       indices to write
 * @param filename  path of the output file
 *
 * Failures to open or write the file are reported on std::cerr.
 */
void writeUncheckedFilenamesToCSV(const std::vector<int> &vec,
                                  const std::string &filename) {
    // std::ios::app appends to an existing file and creates a missing one,
    // so no separate existence probe is needed.
    std::ofstream file(filename, std::ios::app);
    if (!file.is_open()) {
        std::cerr << "Failed to open the file: " << filename << std::endl;
        return;
    }

    for (const int index : vec)
        file << index << "\n";

    file.close();
    if (file.fail())
        std::cerr << "Failed to write to the file: " << filename << std::endl;
}
220
+
221
+ int process_one_par(int *&pars, char **&tokens, buzhash_buffer_t &bh_buffer,
222
+ hash_t &hash_bitmask, ngrhash &local, ngrhash &global,
223
+ int &bad_tokens, int &tok_i, int &par_i, int &total_tokens,
224
+ int &prev_bad_tokens, int &have_dupl_ngrams) {
225
+ ngrhash::const_iterator it;
226
+ for (tok_i = pars[par_i]; tok_i < pars[par_i + 1]; tok_i++) {
227
+ char *token = tokens[tok_i];
228
+ if (token[0] == '<')
229
+ continue;
230
+ total_tokens++;
231
+ prev_bad_tokens--;
232
+ if (prev_bad_tokens < 0)
233
+ prev_bad_tokens = 0;
234
+ hash_t hash = buzhash(token, &bh_buffer);
235
+ hash_t masked_hash = hash & hash_bitmask;
236
+ if (!buzhash_is_full_buffer(&bh_buffer))
237
+ continue;
238
+ it = local.find(hash);
239
+ if (it == local.end()) {
240
+ if (have_dupl_ngrams) {
241
+ // test with the last bit set to 1
242
+ // // (check against already seen duplicate ngrams)
243
+ it = global.find(masked_hash | 1);
244
+ } else {
245
+ it = global.find(masked_hash);
246
+ }
247
+ }
248
+ if (it != global.end()) {
249
+ bad_tokens += Ngram_size - prev_bad_tokens;
250
+ prev_bad_tokens = Ngram_size;
251
+ if (fileNameIndex != it->second) {
252
+ foundFilenames.push_back(
253
+ make_pair(current_file_name, it->second)); // Save the filename.
254
+ }
255
+ }
256
+ #ifdef GOOGLE_SPARSE
257
+ local.insert(std::make_pair(hash, fileNameIndex)); // local.insert(hash);
258
+ #else
259
+ local[hash] = fileNameIndex; // local[hash] = true;
260
+ #endif
261
+ }
262
+ }
263
+
264
+ int process_one_file(int &buffer_content, char **&tokens, int *&pars,
265
+ int *&par_len, char *&bad_par,
266
+
267
+ int *docs,
268
+
269
+ char *doc_tag, int doc_tag_len, char *doc_end_tag,
270
+ int doc_end_tag_len, char *par_tag, int par_tag_len,
271
+ char *par_end_tag, int par_end_tag_len,
272
+
273
+ int &have_dupl_ngrams,
274
+ unsigned long int &total_processed_bytes,
275
+
276
+ // make sure these become mutable
277
+ ngrhash &global, ngrhash &local,
278
+ buzhash_buffer_t &bh_buffer, char *buffer) {
279
+ int bytes_read = fread(buffer + buffer_content, sizeof(char),
280
+ Buffer_size - buffer_content, Input);
281
+ // print("Buffer %x, BufferContent %d, BufferSize %ld, \n", buffer,
282
+ // buffer_content, Buffer_size);
283
+ hash_t hash_bitmask = 0xfffffffffffffffful;
284
+
285
+ int buffer_size = buffer_content + bytes_read;
286
+ buffer[buffer_size] = '\0'; // make it a string
287
+ char *buffer_pos = buffer;
288
+
289
+ // find tokens
290
+ int token_count = 0;
291
+ tokens[token_count++] = buffer_pos++;
292
+ while ((buffer_pos = strchr(buffer_pos, '\n')) != NULL) {
293
+ buffer_pos[0] = '\0';
294
+ tokens[token_count++] = ++buffer_pos;
295
+ }
296
+
297
+ // find docs and paragraphs
298
+ int doc_count = 0;
299
+ int par_count = 0;
300
+ docs[doc_count++] = 0;
301
+ pars[par_count++] = 0;
302
+ int start_doc_next = 0;
303
+ int start_par_next = 0;
304
+ int i;
305
+
306
+ for (i = 1; i < token_count; i++) {
307
+ // "<doc>" or "<doc "
308
+ if (start_doc_next ||
309
+ (strncmp(tokens[i], doc_tag, doc_tag_len) == 0 &&
310
+ (tokens[i][doc_tag_len] == ' ' || tokens[i][doc_tag_len] == '>'))) {
311
+ docs[doc_count++] = par_count;
312
+ pars[par_count++] = i;
313
+ start_doc_next = 0;
314
+ }
315
+ // "</doc>"
316
+ else if (strncmp(tokens[i], doc_end_tag, doc_end_tag_len) == 0) {
317
+ start_doc_next = 1;
318
+ }
319
+ // "<p>" or "<p "
320
+ else if (start_par_next || (strncmp(tokens[i], par_tag, par_tag_len) == 0 &&
321
+ (tokens[i][par_tag_len] == ' ' ||
322
+ tokens[i][par_tag_len] == '>'))) {
323
+ pars[par_count++] = i;
324
+ start_par_next = 0;
325
+ }
326
+ // "</p>"
327
+ else if (strncmp(tokens[i], par_end_tag, par_end_tag_len) == 0) {
328
+ start_par_next = 1;
329
+ }
330
+ }
331
+
332
+ if (doc_count == 1 && !feof(Input)) {
333
+ // full buffer contains only one document
334
+ // by default, documents with higher than 16MB are not checked for dedup
335
+ fprintf(stderr, "Too long document at byte %li.\n", total_processed_bytes);
336
+ UncheckedFilenames.push_back(fileNameIndex);
337
+ return 1;
338
+ }
339
+
340
+ if (feof(Input)) {
341
+ // create sentinels
342
+ docs[doc_count++] = par_count;
343
+ if (strlen(tokens[token_count - 1]) == 0) {
344
+ // files ending with a newline have a natural sentinel
345
+ // (the last zero-length token)
346
+ pars[par_count++] = token_count - 1;
347
+ } else {
348
+ // for other files, we need to make up the last token
349
+ pars[par_count++] = token_count;
350
+ tokens[token_count++] = buffer + buffer_size;
351
+ }
352
+ }
353
+
354
+ // for all documents
355
+ int doc_i;
356
+ for (doc_i = 0; doc_i < doc_count - 1; doc_i++) {
357
+ buzhash_clear_buffer(&bh_buffer);
358
+ local.clear();
359
+ // for all paragraphs in the document
360
+ int par_i;
361
+ for (par_i = docs[doc_i]; par_i < docs[doc_i + 1]; par_i++) {
362
+ int total_tokens = 0;
363
+ int bad_tokens = 0;
364
+ /* prev_bad_tokens is the number of tokens in the current
365
+ * n-gram which are contained in one of the previous bad
366
+ * n-grams.
367
+ *
368
+ * At the beginning of a new paragraph we need to pretend that
369
+ * there are Ngram_size prev_bad_tokens so that the leading
370
+ * bad n-grams do not generate too many bod tokens. */
371
+ int prev_bad_tokens = Ngram_size;
372
+ // for all tokens in the paragraph
373
+ // for every token in the current paragraph, it computes its hash and
374
+ // checks if it is present in the local and global hash maps
375
+ int tok_i;
376
+ float score;
377
+ for (tok_i = pars[par_i]; tok_i < pars[par_i + 1]; tok_i++) {
378
+ process_one_par(pars, tokens, bh_buffer, hash_bitmask, local, global,
379
+ bad_tokens, tok_i, par_i, total_tokens, prev_bad_tokens,
380
+ have_dupl_ngrams);
381
+ }
382
+
383
+ // remember the length of the paragraph
384
+ par_len[par_i] = total_tokens;
385
+
386
+ // mark bad paragraphs
387
+ bad_par[par_i] =
388
+ (total_tokens > 0 && (1.0 * bad_tokens / total_tokens) > Dupl_thres);
389
+
390
+ // get score for each file
391
+ score = (1.0 * bad_tokens / total_tokens);
392
+ DuplicateFilenamesScores.push_back(make_pair(current_file_name, score));
393
+ // DuplicateFilenamesScores.push_back(make_pair(fileNameIndex, score));
394
+ }
395
+
396
+ // smoothing
397
+ if (!No_smoothing) {
398
+ int last_bad_par = docs[doc_i] - 1;
399
+ int stub_length = 0;
400
+ for (par_i = docs[doc_i]; par_i <= docs[doc_i + 1]; par_i++) {
401
+ if (par_i == docs[doc_i + 1] || bad_par[par_i]) {
402
+ if (stub_length <= Max_stub_length) {
403
+ // remove stub
404
+ int par_j;
405
+ for (par_j = last_bad_par + 1; par_j < par_i; par_j++)
406
+ bad_par[par_j] = 1;
407
+ }
408
+ last_bad_par = par_i;
409
+ stub_length = 0;
410
+ } else {
411
+ stub_length += par_len[par_i];
412
+ }
413
+ }
414
+ }
415
+
416
+ int count_bad = 0;
417
+ for (par_i = docs[doc_i]; par_i < docs[doc_i + 1]; par_i++) {
418
+ if (!bad_par[par_i] && par_len[par_i] > 0) {
419
+ } else {
420
+ count_bad += 1;
421
+ }
422
+ }
423
+ if (count_bad > 0) {
424
+ std::sort(foundFilenames.begin(), foundFilenames.end());
425
+ // auto uniqueEnd = std::unique(foundFilenames.begin(),
426
+ // foundFilenames.end()); foundFilenames.erase(uniqueEnd,
427
+ // foundFilenames.end());
428
+ printf("%s is %d bad\n", current_file_name, count_bad);
429
+ DuplicateFilenames.insert(DuplicateFilenames.end(),
430
+ foundFilenames.begin(), foundFilenames.end());
431
+ // DuplicateFilenames.push_back(current_file_name);
432
+ }
433
+
434
+ foundFilenames.clear();
435
+
436
+ // is there at least one good paragraph?
437
+ int all_bad = 1;
438
+ for (par_i = docs[doc_i]; par_i < docs[doc_i + 1]; par_i++) {
439
+ if (!bad_par[par_i] && par_len[par_i] > 0) {
440
+ all_bad = 0;
441
+ break;
442
+ }
443
+ }
444
+
445
+ buzhash_clear_buffer(&bh_buffer);
446
+ // for all paragraphs in the document (again)
447
+ for (par_i = docs[doc_i]; par_i < docs[doc_i + 1]; par_i++) {
448
+ int first_token = pars[docs[doc_i]];
449
+ int last_token = pars[docs[doc_i + 1]] - 1;
450
+ // for all tokens in the paragraph
451
+ int tok_i;
452
+ for (tok_i = pars[par_i]; tok_i < pars[par_i + 1]; tok_i++) {
453
+ char *token = tokens[tok_i];
454
+ int bad_token = bad_par[par_i];
455
+ // tags at document boundaries are preserved if there is
456
+ // at least one good paragraph, removed otherwise
457
+ if (tok_i == first_token && strncmp(token, doc_tag, doc_tag_len) == 0)
458
+ bad_token = all_bad;
459
+ if (tok_i == last_token && strcmp(token, doc_end_tag) == 0)
460
+ bad_token = all_bad;
461
+ // print output
462
+ // if (Strip_dupl) {
463
+ // if (!bad_token)
464
+ // printf("%s\n", token);
465
+ //}
466
+ // else {
467
+ // printf("%i\t%s\n", bad_token, token);
468
+ //}
469
+ if (token[0] == '<')
470
+ continue;
471
+ // store hashes of n-grams
472
+ hash_t hash = buzhash(token, &bh_buffer);
473
+ hash_t masked_hash = hash & hash_bitmask;
474
+ if (!buzhash_is_full_buffer(&bh_buffer))
475
+ continue;
476
+ if (!bad_par[par_i]) {
477
+ if (have_dupl_ngrams) {
478
+ // If we have the list of hashes of all duplicate
479
+ // n-grams, we set the least significant bit of the
480
+ // stored hash to 1 if we have seen the matching
481
+ // duplicate n-gram to indicate it has been seen.
482
+ // Unique n-grams are ignored.
483
+ if (global.erase(masked_hash & BITMASK_HIGH63))
484
+ #ifdef GOOGLE_SPARSE
485
+ global.insert(std::make_pair(masked_hash | 1, fileNameIndex));
486
+ // global.insert (masked_hash | 1);
487
+ #else
488
+ global[masked_hash | 1] =
489
+ fileNameIndex; // global[masked_hash | 1] = true;
490
+ #endif
491
+ } else {
492
+ // otherwise we have to store hashes of all n-grams
493
+ #ifdef GOOGLE_SPARSE
494
+ global.insert(std::make_pair(
495
+ masked_hash, fileNameIndex)); // global.insert(masked_hash);
496
+ #else
497
+ global[masked_hash] = fileNameIndex; // global[masked_hash] = true;
498
+ #endif
499
+ }
500
+ }
501
+ }
502
+ }
503
+ }
504
+ printf("before feof input \n");
505
+
506
+ // copy the unprocessed data to the beginning of the buffer
507
+ if (!feof(Input)) {
508
+ char *last_doc_fst_tok = tokens[pars[docs[doc_count - 1]]];
509
+ int processed_bytes = last_doc_fst_tok - buffer;
510
+ total_processed_bytes += processed_bytes;
511
+ int remaining_bytes = buffer_size - processed_bytes;
512
+ char *buffer_end = last_doc_fst_tok + remaining_bytes;
513
+ char *pos;
514
+ // replace \0s with EOLs (revert the buffer contents to original)
515
+ for (pos = last_doc_fst_tok; pos < buffer_end; pos++)
516
+ if (pos[0] == '\0')
517
+ pos[0] = '\n';
518
+ // print progress information
519
+ if (!Quiet) {
520
+ float percent_done = -1;
521
+ if (Input_size > 0)
522
+ percent_done = 100.0 * total_processed_bytes / Input_size;
523
+ print_progress("removing duplicates", total_processed_bytes,
524
+ percent_done);
525
+ }
526
+ memmove(buffer, last_doc_fst_tok, remaining_bytes);
527
+ buffer_content = remaining_bytes;
528
+ }
529
+ printf("end of process_one_file\n");
530
+ }
531
+
532
+ int main(int argc, char **argv) {
533
+ printf("Dupl Threshold %f \n", Dupl_thres);
534
+ printf("N-gram size %d \n", Ngram_size);
535
+ // get options
536
+ int c;
537
+ char *endptr;
538
+ while ((c = getopt(argc, argv, "f:n:t:d:p:smT:l:b:qVh")) != -1) {
539
+ errno = 0;
540
+ switch (c) {
541
+ case 'f':
542
+ Dupl_hashes_path = optarg;
543
+ break;
544
+ case 'n':
545
+ Ngram_size = strtol(optarg, &endptr, 10);
546
+ if (errno != 0 || *endptr != '\0') {
547
+ fprintf(stderr, "Integer value expected for -n, got: %s\n", optarg);
548
+ print_usage(stderr);
549
+ return 1;
550
+ }
551
+ break;
552
+ case 't':
553
+ Dupl_thres = strtod(optarg, &endptr);
554
+ if (errno != 0 || *endptr != '\0') {
555
+ fprintf(stderr, "Float value expected for -t, got: %s\n", optarg);
556
+ print_usage(stderr);
557
+ return 1;
558
+ }
559
+ break;
560
+ case 'd':
561
+ Doc_tag = optarg;
562
+ break;
563
+ case 'p':
564
+ Par_tag = optarg;
565
+ break;
566
+ case 's':
567
+ Strip_dupl = 1;
568
+ break;
569
+ case 'm':
570
+ No_smoothing = 1;
571
+ break;
572
+ case 'T':
573
+ Trim_hashes = strtol(optarg, &endptr, 10);
574
+ if (errno != 0 || *endptr != '\0') {
575
+ fprintf(stderr, "Integer value expected for -T, got: %s\n", optarg);
576
+ print_usage(stderr);
577
+ return 1;
578
+ }
579
+ break;
580
+ case 'l':
581
+ Max_stub_length = strtol(optarg, &endptr, 10);
582
+ if (errno != 0 || *endptr != '\0') {
583
+ fprintf(stderr, "Integer value expected for -l, got: %s\n", optarg);
584
+ print_usage(stderr);
585
+ return 1;
586
+ }
587
+ break;
588
+ case 'b':
589
+ Buffer_size = strtol(optarg, &endptr, 10);
590
+ if (errno != 0 || *endptr != '\0') {
591
+ fprintf(stderr, "Integer value expected for -b, got: %s\n", optarg);
592
+ print_usage(stderr);
593
+ return 1;
594
+ }
595
+ break;
596
+ case 'q':
597
+ Quiet = 1;
598
+ break;
599
+ case 'V':
600
+ print_version("onion");
601
+ return 0;
602
+ case 'h':
603
+ print_usage(stdout);
604
+ return 0;
605
+ case '?':
606
+ print_usage(stderr);
607
+ return 1;
608
+ }
609
+ }
610
+
611
+ Input = stdin;
612
+ Input_size = -1;
613
+ if (optind < argc) {
614
+ char *filename = argv[optind];
615
+ if (strcmp(filename, "-") != 0) {
616
+ errno = 0;
617
+ Input = fopen(filename, "r");
618
+ if (errno != 0) {
619
+ fprintf(stderr, "Unable to open %s for reading.\n", filename);
620
+ return 1;
621
+ }
622
+ fseek(Input, 0L, SEEK_END);
623
+ Input_size = ftell(Input);
624
+ fseek(Input, 0L, SEEK_SET);
625
+ }
626
+ }
627
+
628
+ unsigned long int total_processed_bytes = 0;
629
+
630
+ // patterns
631
+ char *doc_tag = (char *)malloc((strlen(Doc_tag) + 1 + 1) * sizeof(char));
632
+ strcat(strcpy(doc_tag, "<"), Doc_tag);
633
+ char *doc_end_tag = (char *)malloc((strlen(Doc_tag) + 3 + 1) * sizeof(char));
634
+ strcat(strcat(strcpy(doc_end_tag, "</"), Doc_tag), ">");
635
+ char *par_tag = (char *)malloc((strlen(Par_tag) + 1 + 1) * sizeof(char));
636
+ strcat(strcpy(par_tag, "<"), Par_tag);
637
+ char *par_end_tag = (char *)malloc((strlen(Par_tag) + 3 + 1) * sizeof(char));
638
+ strcat(strcat(strcpy(par_end_tag, "</"), Par_tag), ">");
639
+
640
+ int doc_tag_len = strlen(doc_tag);
641
+ int doc_end_tag_len = strlen(doc_end_tag);
642
+ int par_tag_len = strlen(par_tag);
643
+ int par_end_tag_len = strlen(par_end_tag);
644
+
645
+ // bitmask for trimming ngram hashes
646
+ hash_t hash_bitmask = 0xfffffffffffffffful;
647
+ int bitshift = 64 - Trim_hashes;
648
+ if (bitshift > 0)
649
+ hash_bitmask >>= bitshift;
650
+
651
+ // data structures
652
+ int buffer_size = 0;
653
+ int buffer_content = 0;
654
+ char *buffer = (char *)malloc((Buffer_size + 1) * sizeof(char));
655
+ char **tokens = (char **)malloc((Buffer_size + 1) * sizeof(char *));
656
+ int *pars = (int *)malloc((Buffer_size + 1) *
657
+ sizeof(int)); // array of starting tokens
658
+ int *par_len = (int *)malloc((Buffer_size + 1) * sizeof(int));
659
+ char *bad_par = (char *)malloc((Buffer_size + 1) * sizeof(char));
660
+ int *docs =
661
+ (int *)malloc((Buffer_size + 1) * sizeof(int)); // array of starting pars
662
+ int token_count, par_count, doc_count;
663
+
664
+ // buzhash
665
+ buzhash_buffer_t bh_buffer;
666
+ buzhash_init_buffer(&bh_buffer, Ngram_size);
667
+
668
+ // global hash table stores the hashes of all files read so far (in our case
669
+ // the full document since we did not mark paragraph separators) local hash
670
+ // table stores the hahses of n-grams found within the currently processed
671
+ // file (doc as a whole in our case)
672
+ ngrhash global, local;
673
+ #ifdef GOOGLE_SPARSE
674
+ global.set_deleted_key(0);
675
+ local.set_deleted_key(0);
676
+ #endif
677
+
678
+ // read hashes of duplicate n-grams if available
679
+ int have_dupl_ngrams = 0;
680
+ if (Dupl_hashes_path != NULL) {
681
+ have_dupl_ngrams = 1;
682
+ errno = 0;
683
+ FILE *ngrams_fp = fopen(Dupl_hashes_path, "r");
684
+ if (errno != 0) {
685
+ fprintf(stderr, "Unable to open %s for reading.\n", Dupl_hashes_path);
686
+ return 1;
687
+ }
688
+ fseek(ngrams_fp, 0L, SEEK_END);
689
+ unsigned long int ngrams_size = ftell(ngrams_fp);
690
+ fseek(ngrams_fp, 0L, SEEK_SET);
691
+
692
+ unsigned long int bytes_read = 0;
693
+ hash_t hash;
694
+ while (fread(&hash, sizeof(hash), 1, ngrams_fp)) {
695
+ printf("reading");
696
+ bytes_read += sizeof(hash);
697
+ hash_t masked_hash = hash & hash_bitmask;
698
+ // store only the 63 most significant bits of the hash;
699
+ // reserve the last bit as a flag (seen / unseen)
700
+ // #ifdef GOOGLE_SPARSE
701
+ // global.insert(std::make_pair(masked_hash | 1,
702
+ // fileNameIndex));//global.insert (masked_hash & BITMASK_HIGH63); #else
703
+ // global[masked_hash & BITMASK_HIGH63] = true;
704
+ // #endif
705
+
706
+ // print progress information
707
+ if (!Quiet && bytes_read % (10000000 * sizeof(hash)) == 0) {
708
+ float percent_done = -1;
709
+ if (ngrams_size > 0)
710
+ percent_done = 100.0 * bytes_read / ngrams_size;
711
+ print_progress("reading hashes", bytes_read, percent_done);
712
+ }
713
+ }
714
+
715
+ if (!Quiet)
716
+ print_progress("reading hashes", bytes_read, 100);
717
+ }
718
+
719
+ int fileIndex =
720
+ 0; // File index t store every 300 duplicates in a new csv file
721
+ const char *datasetname = "Duplicates"; // Name of the dataset
722
+ FILE *List_of_Files = fopen("duplicates_paths.txt", "r");
723
+ if (List_of_Files == NULL) {
724
+ perror("Unable to open list of files!");
725
+ exit(1);
726
+ }
727
+
728
+ while (!feof(List_of_Files)) {
729
+ fileNameIndex++;
730
+ /* read the name of each file in the list of files*/
731
+ if (fgets(current_file_name, sizeof(current_file_name), List_of_Files) ==
732
+ NULL) {
733
+ break;
734
+ }
735
+ int last_ch = current_file_name[strlen(current_file_name) - 1];
736
+ if (last_ch == '\n') {
737
+ current_file_name[strlen(current_file_name) - 1] = '\0';
738
+ }
739
+ /* use fopen to assign Input to it*/
740
+ Input = fopen(current_file_name, "r");
741
+ if (Input == NULL) {
742
+ char err_msg[1024];
743
+ snprintf(err_msg, 1023, "Error to open data file %s--",
744
+ current_file_name);
745
+ perror(err_msg);
746
+ exit(1);
747
+ continue;
748
+ }
749
+
750
+ printf("File Number: %d \n", fileNameIndex);
751
+
752
+ /* Process it as below*/
753
+ // it modifies the data structures passed to it as arguments, processes the
754
+ // input file, and updates global state
755
+ process_one_file(buffer_content, /* int */
756
+ tokens, /* char** */
757
+ pars, /* int* */
758
+ par_len, /* int* */
759
+ bad_par, /* char* */
760
+
761
+ docs, /* int* */
762
+
763
+ doc_tag, /* char* */
764
+ doc_tag_len, /* int */
765
+ doc_end_tag, /* char* */
766
+ doc_end_tag_len, /* int */
767
+ par_tag, /* char* */
768
+ par_tag_len, /* int */
769
+ par_end_tag, /* char* */
770
+ par_end_tag_len, /* int */
771
+
772
+ have_dupl_ngrams, /* long int int */
773
+ total_processed_bytes, /* unsigned */
774
+ // make sure these become mutable
775
+ global, /* ngrhash */
776
+ local, /* ngrhash */
777
+ bh_buffer, /* buzhash_buffer_t */
778
+ buffer /* char* */
779
+ );
780
+ printf("after calling process_one_file\n");
781
+
782
+ if (DuplicateFilenames.size() > 2000) {
783
+ fileIndex++;
784
+ std::string filename = "Duplicate_pair_files_" +
785
+ std::string(datasetname) + "_" +
786
+ to_string(fileIndex) + ".csv";
787
+ writeFilenameDuplicateToCSV(DuplicateFilenames, filename);
788
+ // writeDuplicatesToCSV(DuplicateFilenames, "duplicated.csv");
789
+ // foundFilenames.clear();
790
+ writeFilenameDuplicateScoreToCSV(DuplicateFilenamesScores,
791
+ "Duplicates_scores.csv");
792
+ DuplicateFilenames.clear();
793
+ DuplicateFilenamesScores.clear();
794
+ printf(" 1000 Duplicated files are saved\n");
795
+ }
796
+
797
+ fclose(Input);
798
+ }
799
+
800
+ // write the remaining duplicate files
801
+ fileIndex++;
802
+ std::string filename = "Duplicate_pair_files_" + std::string(datasetname) +
803
+ "_" + to_string(fileIndex) + ".csv";
804
+ writeFilenameDuplicateToCSV(DuplicateFilenames, filename);
805
+ std::string name = "Unchecked_files_" + std::string(datasetname) + "_" +
806
+ to_string(fileIndex) + ".csv";
807
+ writeUncheckedFilenamesToCSV(UncheckedFilenames, name);
808
+
809
+ // print progress information
810
+ total_processed_bytes += buffer_size;
811
+ if (!Quiet)
812
+ print_progress("removing duplicates", total_processed_bytes, 100);
813
+
814
+ // if (Input != stdin)
815
+ // fclose(Input);
816
+
817
+ // save the global hash map
818
+ // saveGlobalHashmap(global); // This will append to the existing file or
819
+ // create a new one if not present
820
+
821
+ return 0;
822
+ }
823
+
824
+ // vim: ts=4 sw=4 sta et sts=4 si cindent tw=80: