dalla-data-processing 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dalla/__init__.py +27 -0
- dalla/cli.py +453 -0
- dalla/core/__init__.py +6 -0
- dalla/core/dataset.py +387 -0
- dalla/core/parallel.py +279 -0
- dalla/deduplication/__init__.py +370 -0
- dalla/deduplication/bin/.gitignore +1 -0
- dalla/deduplication/bin/onion-linux-x86_64 +0 -0
- dalla/deduplication/onion/COPYING +24 -0
- dalla/deduplication/onion/Makefile +21 -0
- dalla/deduplication/onion/Makefile.config +3 -0
- dalla/deduplication/onion/README.md +21 -0
- dalla/deduplication/onion/src/Makefile +22 -0
- dalla/deduplication/onion/src/Makefile.g +23 -0
- dalla/deduplication/onion/src/buzhash.c +325 -0
- dalla/deduplication/onion/src/buzhash.h +30 -0
- dalla/deduplication/onion/src/hashdup.c +172 -0
- dalla/deduplication/onion/src/hashgen.c +206 -0
- dalla/deduplication/onion/src/onion +0 -0
- dalla/deduplication/onion/src/onion.c +799 -0
- dalla/deduplication/onion/src/onion_dup.c +824 -0
- dalla/deduplication/onion/src/version.c +17 -0
- dalla/deduplication/onion/src/version.h +10 -0
- dalla/deduplication/onion/src_sc/Makefile +22 -0
- dalla/deduplication/onion/src_sc/Makefile.g +23 -0
- dalla/deduplication/onion/src_sc/buzhash.c +325 -0
- dalla/deduplication/onion/src_sc/buzhash.h +30 -0
- dalla/deduplication/onion/src_sc/hashdup +0 -0
- dalla/deduplication/onion/src_sc/hashdup.c +172 -0
- dalla/deduplication/onion/src_sc/hashgen +0 -0
- dalla/deduplication/onion/src_sc/hashgen.c +206 -0
- dalla/deduplication/onion/src_sc/onion.c +854 -0
- dalla/deduplication/onion/src_sc/onion_dup.c +824 -0
- dalla/deduplication/onion/src_sc/version.c +17 -0
- dalla/deduplication/onion/src_sc/version.h +10 -0
- dalla/deduplication/onion_wrapper.py +223 -0
- dalla/deduplication/postprocessing.py +216 -0
- dalla/deduplication/preprocessing.py +120 -0
- dalla/quality/__init__.py +5 -0
- dalla/quality/checker.py +354 -0
- dalla/readability/__init__.py +197 -0
- dalla/readability/ranking.py +165 -0
- dalla/readability/scorer.py +148 -0
- dalla/stemming/__init__.py +551 -0
- dalla/stemming/data/words_al.txt +3414 -0
- dalla/stemming/data/words_al_t.txt +885 -0
- dalla/stemming/data/words_t.txt +7 -0
- dalla/utils/__init__.py +10 -0
- dalla/utils/logger.py +128 -0
- dalla/utils/tokenize.py +89 -0
- dalla_data_processing-0.0.1.dist-info/METADATA +393 -0
- dalla_data_processing-0.0.1.dist-info/RECORD +55 -0
- dalla_data_processing-0.0.1.dist-info/WHEEL +5 -0
- dalla_data_processing-0.0.1.dist-info/entry_points.txt +2 -0
- dalla_data_processing-0.0.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,799 @@
|
|
|
1
|
+
/*********************************************************************
|
|
2
|
+
* Copyright (c) 2011-2020 Jan Pomikalek, Milos Jakubicek *
|
|
3
|
+
* All rights reserved. *
|
|
4
|
+
* *
|
|
5
|
+
* This software is licensed as described in the file COPYING, which *
|
|
6
|
+
* you should have received as part of this distribution. *
|
|
7
|
+
*********************************************************************
|
|
8
|
+
* *
|
|
9
|
+
* Refactored in 2023 by Sally Choker and Fadi Zaraket. *
|
|
10
|
+
* Modifications: *
|
|
11
|
+
* - Adapted to take as input a list of file paths instead of *
|
|
12
|
+
* a single file. *
|
|
13
|
+
* - Enhanced to detect the source of duplicate files. *
|
|
14
|
+
*********************************************************************/
|
|
15
|
+
|
|
16
|
+
#include "buzhash.h"
#include "version.h"

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <time.h>
#include <unistd.h>

#include <algorithm>
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
|
|
32
|
+
|
|
33
|
+
#define GOOGLE_SPARSE
|
|
34
|
+
|
|
35
|
+
#if defined GOOGLE_SPARSE
|
|
36
|
+
#include <sparsehash/sparse_hash_map>
|
|
37
|
+
// #include <sparsehash/sparse_hash_set>
|
|
38
|
+
// using google::sparse_hash_set;
|
|
39
|
+
#elif defined __GNUC__ || defined __APPLE__
|
|
40
|
+
#include <ext/hash_map>
|
|
41
|
+
namespace std { using namespace __gnu_cxx; }
|
|
42
|
+
#else
|
|
43
|
+
#include <hash_map>
|
|
44
|
+
#endif
|
|
45
|
+
using namespace std;
|
|
46
|
+
|
|
47
|
+
#ifdef GOOGLE_SPARSE
|
|
48
|
+
// typedef sparse_hash_set<uint64_t> ngrhash;
|
|
49
|
+
typedef google::sparse_hash_map<uint64_t, int> ngrhash;
|
|
50
|
+
#else
|
|
51
|
+
// typedef hash_map<uint64_t,bool> ngrhash;
|
|
52
|
+
typedef hash_map<uint64_t, int> ngrhash;
|
|
53
|
+
#endif
|
|
54
|
+
|
|
55
|
+
/**
 * Record describing one duplicate relation between two input files.
 *
 * NOTE(review): nothing in the visible part of this file instantiates the
 * struct; confirm whether it is still needed.
 */
struct DupVal {
  /// Name of the original file, extracted from the global map.
  std::string original_f_name;
  /// Name of the file that is currently being processed.
  std::string dup_f_name;
  /// Number of matches recorded for this pair.
  int matches;
};
|
|
60
|
+
|
|
61
|
+
// (file name, source file index) pairs collected while the current document
// is scanned; one pair is recorded for every n-gram that was first stored by
// a different input file.
std::vector<std::pair<std::string, int>> foundFilenames;

// (file name, source file index) pairs of documents that were found to
// contain duplicated content; flushed to CSV files by main().
std::vector<std::pair<std::string, int>> DuplicateFilenames;

// Indices of the input files that were not checked for duplication because a
// single document did not fit in the read buffer.
std::vector<int> UncheckedFilenames;
|
|
69
|
+
|
|
70
|
+
// Mask that clears the least significant bit of a 64-bit hash. That bit is
// reserved as a seen/unseen flag when a list of duplicate n-grams is used.
#define BITMASK_HIGH63 0xfffffffffffffffeul

// Built-in defaults of the command-line options.
#define NGRAM_SIZE 7
#define DUPL_THRES 0.6
#define DOC_TAG "doc"
#define PAR_TAG "p"
#define TRIM_HASHES 64
#define MAX_STUB_LENGTH 20
// 320 * 1024 * 1024 bytes
#define BUFFER_SIZE 335544320

// options
int Ngram_size{NGRAM_SIZE};            // -n: n-gram length
float Dupl_thres{DUPL_THRES};          // -t: duplicate content threshold
const char *Doc_tag{DOC_TAG};          // -d: document tag name
const char *Par_tag{PAR_TAG};          // -p: paragraph tag name
int Strip_dupl{0};                     // -s: strip duplicate parts
int No_smoothing{0};                   // -m: disable smoothing
int Trim_hashes{TRIM_HASHES};          // -T: number of hash bits to keep
int Max_stub_length{MAX_STUB_LENGTH};  // -l: max stub length
long Buffer_size{BUFFER_SIZE};         // -b: read buffer size in bytes
int Quiet{0};                          // -q: suppress progress output
char *Dupl_hashes_path{nullptr};       // -f: file with duplicate n-gram hashes

// State shared between main() and the processing functions.
FILE *Input{nullptr};           // stream of the file currently being read
long int Input_size{0};         // size of the input in bytes, when known
char current_file_name[2048]{}; // path of the file currently being processed
int fileNameIndex{0};           // 1-based position of that file in the list
|
|
96
|
+
|
|
97
|
+
void print_usage(FILE *stream) {
|
|
98
|
+
fprintf(stream, "\
|
|
99
|
+
Usage: onion [OPTIONS] [FILE]\n\
|
|
100
|
+
Mark duplicate text parts in the input vertical file.\n\
|
|
101
|
+
\n\
|
|
102
|
+
-f FILE hashes of duplicate n-grams\n\
|
|
103
|
+
-n NUM n-gram length (default: %i)\n\
|
|
104
|
+
-t NUM duplicate content threshold (default: %.1f)\n\
|
|
105
|
+
-d STR document tag (default: %s)\n\
|
|
106
|
+
-p STR paragraph tag (default: %s)\n\
|
|
107
|
+
-s strip duplicate parts (rather than mark)\n\
|
|
108
|
+
-m no smoothing\n\
|
|
109
|
+
-T NUM trim n-gram hashes to NUM bits (default: %i)\n\
|
|
110
|
+
-l NUM max stub length (default: %i)\n\
|
|
111
|
+
-b NUM buffer size, in bytes (default: %i)\n\
|
|
112
|
+
-q quiet; suppress all output except for errors\n\
|
|
113
|
+
\n\
|
|
114
|
+
-V print version information and exit\n\
|
|
115
|
+
-h display this help and exit\n\
|
|
116
|
+
\n\
|
|
117
|
+
With no FILE, or when FILE is -, read standard input.\n\
|
|
118
|
+
Output is written to standard output.\n\
|
|
119
|
+
\n\
|
|
120
|
+
Project home page: <http://code.google.com/p/onion/>\n",
|
|
121
|
+
NGRAM_SIZE, DUPL_THRES, DOC_TAG, PAR_TAG, TRIM_HASHES,
|
|
122
|
+
MAX_STUB_LENGTH, BUFFER_SIZE);
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
/**
 * Writes a one-line progress report to stderr.
 *
 * The line has the form
 *   [<timestamp>] onion: <task>: <N> MB processed (<P>%)\t<M> MB RAM used
 * where the percentage part is omitted when percent_done is negative.
 *
 * @param task_descr       short description of the running task
 * @param processed_bytes  number of bytes handled so far
 * @param percent_done     completion percentage, or a negative value when
 *                         it is unknown
 */
void print_progress(const char *task_descr, unsigned long int processed_bytes,
                    float percent_done) {
  time_t now;
  struct rusage usage;
  time(&now);
  getrusage(RUSAGE_SELF, &usage);

  // ctime() ends its result with '\n'; "%.24s" prints the text without it.
  // processed_bytes is unsigned, hence %lu.
  fprintf(stderr, "[%.24s] onion: %s: %6lu MB processed", ctime(&now),
          task_descr, processed_bytes / (1024 * 1024));
  if (percent_done >= 0)
    fprintf(stderr, " (%6.2f%%)", percent_done);

  // assumes ru_maxrss is reported in kilobytes (as on Linux) - TODO confirm
  // for other platforms
  fprintf(stderr, "\t%6li MB RAM used\n", usage.ru_maxrss / 1024);
}
|
|
138
|
+
|
|
139
|
+
void saveGlobalHashmap(const ngrhash &global) {
|
|
140
|
+
std::ofstream outFile("path", std::ios::binary | std::ios::app);
|
|
141
|
+
if (!outFile.is_open()) {
|
|
142
|
+
throw std::runtime_error("Failed to open file for writing");
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
for (const auto &pair : global) {
|
|
146
|
+
outFile.write(reinterpret_cast<const char *>(&pair.first),
|
|
147
|
+
sizeof(pair.first));
|
|
148
|
+
outFile.write(reinterpret_cast<const char *>(&pair.second),
|
|
149
|
+
sizeof(pair.second));
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
outFile.close();
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
/**
 * Reports whether stat() succeeds for the given path.
 *
 * @param name path to probe
 * @return true when stat() returns 0, false otherwise
 */
bool fileExists(const std::string &name) {
  struct stat info;
  const int rc = stat(name.c_str(), &info);
  return rc == 0;
}
|
|
159
|
+
|
|
160
|
+
/**
 * Appends (file name, source file index) pairs to a CSV file, one
 * `"name",index` row per pair. The file is created when it does not exist.
 *
 * File names are written as quoted CSV fields; a double quote inside a name
 * is escaped by doubling it so that the row stays parseable.
 *
 * @param vec       pairs to write
 * @param fullPath  path of the CSV file
 */
void writeFilenameDuplicateToCSV(
    const std::vector<std::pair<std::string, int>> &vec,
    const std::string &fullPath) {
  // Append mode creates a missing file and never truncates an existing one,
  // so no separate existence check is needed.
  std::ofstream file(fullPath, std::ios::app);
  if (!file.is_open()) {
    std::cerr << "Failed to open the file: " << fullPath << "\n";
    return;
  }

  for (const auto &entry : vec) {
    file << '"';
    for (const char ch : entry.first) {
      if (ch == '"')
        file << '"';  // CSV escape: "" stands for a literal quote
      file << ch;
    }
    file << "\"," << entry.second << "\n";
  }

  file.flush();
  if (!file)
    std::cerr << "Failed to write to the file: " << fullPath << "\n";
}
|
|
179
|
+
|
|
180
|
+
/**
 * Appends the given file indices to a text file, one number per line. The
 * file is created when it does not exist.
 *
 * @param vec       indices of the files that were not checked
 * @param fullPath  path of the output file
 */
void writeUncheckedFilenamesToCSV(const std::vector<int> &vec,
                                  const std::string &fullPath) {
  // Append mode creates a missing file and never truncates an existing one,
  // so no separate existence check is needed.
  std::ofstream file(fullPath, std::ios::app);
  if (!file.is_open()) {
    std::cerr << "Failed to open the file: " << fullPath << "\n";
    return;
  }

  for (const int index : vec)
    file << index << "\n";

  file.flush();
  if (!file)
    std::cerr << "Failed to write to the file: " << fullPath << "\n";
}
|
|
198
|
+
|
|
199
|
+
int process_one_par(int *&pars, char **&tokens, buzhash_buffer_t &bh_buffer,
|
|
200
|
+
hash_t &hash_bitmask, ngrhash &local, ngrhash &global,
|
|
201
|
+
int &bad_tokens, int &tok_i, int &par_i, int &total_tokens,
|
|
202
|
+
int &prev_bad_tokens, int &have_dupl_ngrams) {
|
|
203
|
+
ngrhash::const_iterator it;
|
|
204
|
+
for (tok_i = pars[par_i]; tok_i < pars[par_i + 1]; tok_i++) {
|
|
205
|
+
char *token = tokens[tok_i];
|
|
206
|
+
if (token[0] == '<')
|
|
207
|
+
continue;
|
|
208
|
+
total_tokens++;
|
|
209
|
+
prev_bad_tokens--;
|
|
210
|
+
if (prev_bad_tokens < 0)
|
|
211
|
+
prev_bad_tokens = 0;
|
|
212
|
+
hash_t hash = buzhash(token, &bh_buffer);
|
|
213
|
+
hash_t masked_hash = hash & hash_bitmask;
|
|
214
|
+
if (!buzhash_is_full_buffer(&bh_buffer))
|
|
215
|
+
continue;
|
|
216
|
+
it = local.find(hash);
|
|
217
|
+
if (it == local.end()) {
|
|
218
|
+
if (have_dupl_ngrams) {
|
|
219
|
+
// test with the last bit set to 1
|
|
220
|
+
// // (check against already seen duplicate ngrams)
|
|
221
|
+
it = global.find(masked_hash | 1);
|
|
222
|
+
} else {
|
|
223
|
+
it = global.find(masked_hash);
|
|
224
|
+
}
|
|
225
|
+
}
|
|
226
|
+
if (it != global.end()) {
|
|
227
|
+
bad_tokens += Ngram_size - prev_bad_tokens;
|
|
228
|
+
prev_bad_tokens = Ngram_size;
|
|
229
|
+
if (fileNameIndex != it->second) {
|
|
230
|
+
foundFilenames.push_back(make_pair(current_file_name, it->second));
|
|
231
|
+
}
|
|
232
|
+
}
|
|
233
|
+
#ifdef GOOGLE_SPARSE
|
|
234
|
+
local.insert(std::make_pair(hash, fileNameIndex));
|
|
235
|
+
#else
|
|
236
|
+
local[hash] = fileNameIndex;
|
|
237
|
+
#endif
|
|
238
|
+
}
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
int process_one_file(int &buffer_content, char **&tokens, int *&pars,
|
|
242
|
+
int *&par_len, char *&bad_par,
|
|
243
|
+
|
|
244
|
+
int *docs,
|
|
245
|
+
|
|
246
|
+
char *doc_tag, int doc_tag_len, char *doc_end_tag,
|
|
247
|
+
int doc_end_tag_len, char *par_tag, int par_tag_len,
|
|
248
|
+
char *par_end_tag, int par_end_tag_len,
|
|
249
|
+
|
|
250
|
+
int &have_dupl_ngrams,
|
|
251
|
+
unsigned long int &total_processed_bytes,
|
|
252
|
+
|
|
253
|
+
// make sure these become mutable
|
|
254
|
+
ngrhash &global, ngrhash &local,
|
|
255
|
+
buzhash_buffer_t &bh_buffer, char *buffer) {
|
|
256
|
+
int bytes_read = fread(buffer + buffer_content, sizeof(char),
|
|
257
|
+
Buffer_size - buffer_content, Input);
|
|
258
|
+
// print("Buffer %x, BufferContent %d, BufferSize %ld, \n", buffer,
|
|
259
|
+
// buffer_content, Buffer_size);
|
|
260
|
+
hash_t hash_bitmask = 0xfffffffffffffffful;
|
|
261
|
+
|
|
262
|
+
int buffer_size = buffer_content + bytes_read;
|
|
263
|
+
buffer[buffer_size] = '\0'; // make it a string
|
|
264
|
+
char *buffer_pos = buffer;
|
|
265
|
+
|
|
266
|
+
// find tokens
|
|
267
|
+
int token_count = 0;
|
|
268
|
+
tokens[token_count++] = buffer_pos++;
|
|
269
|
+
while ((buffer_pos = strchr(buffer_pos, '\n')) != NULL) {
|
|
270
|
+
buffer_pos[0] = '\0';
|
|
271
|
+
tokens[token_count++] = ++buffer_pos;
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
// find docs and paragraphs
|
|
275
|
+
int doc_count = 0;
|
|
276
|
+
int par_count = 0;
|
|
277
|
+
docs[doc_count++] = 0;
|
|
278
|
+
pars[par_count++] = 0;
|
|
279
|
+
int start_doc_next = 0;
|
|
280
|
+
int start_par_next = 0;
|
|
281
|
+
int i;
|
|
282
|
+
|
|
283
|
+
for (i = 1; i < token_count; i++) {
|
|
284
|
+
// "<doc>" or "<doc "
|
|
285
|
+
if (start_doc_next ||
|
|
286
|
+
(strncmp(tokens[i], doc_tag, doc_tag_len) == 0 &&
|
|
287
|
+
(tokens[i][doc_tag_len] == ' ' || tokens[i][doc_tag_len] == '>'))) {
|
|
288
|
+
docs[doc_count++] = par_count;
|
|
289
|
+
pars[par_count++] = i;
|
|
290
|
+
start_doc_next = 0;
|
|
291
|
+
}
|
|
292
|
+
// "</doc>"
|
|
293
|
+
else if (strncmp(tokens[i], doc_end_tag, doc_end_tag_len) == 0) {
|
|
294
|
+
start_doc_next = 1;
|
|
295
|
+
}
|
|
296
|
+
// "<p>" or "<p "
|
|
297
|
+
else if (start_par_next || (strncmp(tokens[i], par_tag, par_tag_len) == 0 &&
|
|
298
|
+
(tokens[i][par_tag_len] == ' ' ||
|
|
299
|
+
tokens[i][par_tag_len] == '>'))) {
|
|
300
|
+
pars[par_count++] = i;
|
|
301
|
+
start_par_next = 0;
|
|
302
|
+
}
|
|
303
|
+
// "</p>"
|
|
304
|
+
else if (strncmp(tokens[i], par_end_tag, par_end_tag_len) == 0) {
|
|
305
|
+
start_par_next = 1;
|
|
306
|
+
}
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
if (doc_count == 1 && !feof(Input)) {
|
|
310
|
+
// full buffer contains only one document
|
|
311
|
+
// by default, documents with higher than 16MB are not checked for dedup
|
|
312
|
+
fprintf(stderr, "Too long document at byte %li.\n", total_processed_bytes);
|
|
313
|
+
UncheckedFilenames.push_back(fileNameIndex);
|
|
314
|
+
return 1;
|
|
315
|
+
}
|
|
316
|
+
|
|
317
|
+
if (feof(Input)) {
|
|
318
|
+
// create sentinels
|
|
319
|
+
docs[doc_count++] = par_count;
|
|
320
|
+
if (strlen(tokens[token_count - 1]) == 0) {
|
|
321
|
+
// files ending with a newline have a natural sentinel
|
|
322
|
+
// (the last zero-length token)
|
|
323
|
+
pars[par_count++] = token_count - 1;
|
|
324
|
+
} else {
|
|
325
|
+
// for other files, we need to make up the last token
|
|
326
|
+
pars[par_count++] = token_count;
|
|
327
|
+
tokens[token_count++] = buffer + buffer_size;
|
|
328
|
+
}
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
// for all documents
|
|
332
|
+
int doc_i;
|
|
333
|
+
for (doc_i = 0; doc_i < doc_count - 1; doc_i++) {
|
|
334
|
+
buzhash_clear_buffer(&bh_buffer);
|
|
335
|
+
local.clear();
|
|
336
|
+
// for all paragraphs in the document
|
|
337
|
+
int par_i;
|
|
338
|
+
for (par_i = docs[doc_i]; par_i < docs[doc_i + 1]; par_i++) {
|
|
339
|
+
int total_tokens = 0;
|
|
340
|
+
int bad_tokens = 0;
|
|
341
|
+
/* prev_bad_tokens is the number of tokens in the current
|
|
342
|
+
* n-gram which are contained in one of the previous bad
|
|
343
|
+
* n-grams.
|
|
344
|
+
*
|
|
345
|
+
* At the beginning of a new paragraph we need to pretend that
|
|
346
|
+
* there are Ngram_size prev_bad_tokens so that the leading
|
|
347
|
+
* bad n-grams do not generate too many bod tokens. */
|
|
348
|
+
int prev_bad_tokens = Ngram_size;
|
|
349
|
+
// for all tokens in the paragraph
|
|
350
|
+
// for every token in the current paragraph, it computes its hash and
|
|
351
|
+
// checks if it is present in the local and global hash maps
|
|
352
|
+
int tok_i;
|
|
353
|
+
for (tok_i = pars[par_i]; tok_i < pars[par_i + 1]; tok_i++) {
|
|
354
|
+
process_one_par(pars, tokens, bh_buffer, hash_bitmask, local, global,
|
|
355
|
+
bad_tokens, tok_i, par_i, total_tokens, prev_bad_tokens,
|
|
356
|
+
have_dupl_ngrams);
|
|
357
|
+
}
|
|
358
|
+
|
|
359
|
+
// remember the length of the paragraph
|
|
360
|
+
par_len[par_i] = total_tokens;
|
|
361
|
+
|
|
362
|
+
// mark bad paragraphs
|
|
363
|
+
bad_par[par_i] =
|
|
364
|
+
(total_tokens > 0 && (1.0 * bad_tokens / total_tokens) > Dupl_thres);
|
|
365
|
+
}
|
|
366
|
+
|
|
367
|
+
// smoothing
|
|
368
|
+
if (!No_smoothing) {
|
|
369
|
+
int last_bad_par = docs[doc_i] - 1;
|
|
370
|
+
int stub_length = 0;
|
|
371
|
+
for (par_i = docs[doc_i]; par_i <= docs[doc_i + 1]; par_i++) {
|
|
372
|
+
if (par_i == docs[doc_i + 1] || bad_par[par_i]) {
|
|
373
|
+
if (stub_length <= Max_stub_length) {
|
|
374
|
+
// remove stub
|
|
375
|
+
int par_j;
|
|
376
|
+
for (par_j = last_bad_par + 1; par_j < par_i; par_j++)
|
|
377
|
+
bad_par[par_j] = 1;
|
|
378
|
+
}
|
|
379
|
+
last_bad_par = par_i;
|
|
380
|
+
stub_length = 0;
|
|
381
|
+
} else {
|
|
382
|
+
stub_length += par_len[par_i];
|
|
383
|
+
}
|
|
384
|
+
}
|
|
385
|
+
}
|
|
386
|
+
|
|
387
|
+
int count_bad = 0;
|
|
388
|
+
for (par_i = docs[doc_i]; par_i < docs[doc_i + 1]; par_i++) {
|
|
389
|
+
if (!bad_par[par_i] && par_len[par_i] > 0) {
|
|
390
|
+
} else {
|
|
391
|
+
count_bad += 1;
|
|
392
|
+
}
|
|
393
|
+
}
|
|
394
|
+
if (count_bad > 0) {
|
|
395
|
+
std::sort(foundFilenames.begin(), foundFilenames.end());
|
|
396
|
+
auto uniqueEnd =
|
|
397
|
+
std::unique(foundFilenames.begin(), foundFilenames.end());
|
|
398
|
+
foundFilenames.erase(uniqueEnd, foundFilenames.end());
|
|
399
|
+
printf("%s is %d bad\n", current_file_name, count_bad);
|
|
400
|
+
DuplicateFilenames.insert(DuplicateFilenames.end(),
|
|
401
|
+
foundFilenames.begin(), foundFilenames.end());
|
|
402
|
+
// DuplicateFilenames.push_back(current_file_name);
|
|
403
|
+
}
|
|
404
|
+
|
|
405
|
+
foundFilenames.clear();
|
|
406
|
+
|
|
407
|
+
// is there at least one good paragraph?
|
|
408
|
+
int all_bad = 1;
|
|
409
|
+
for (par_i = docs[doc_i]; par_i < docs[doc_i + 1]; par_i++) {
|
|
410
|
+
if (!bad_par[par_i] && par_len[par_i] > 0) {
|
|
411
|
+
all_bad = 0;
|
|
412
|
+
break;
|
|
413
|
+
}
|
|
414
|
+
}
|
|
415
|
+
|
|
416
|
+
buzhash_clear_buffer(&bh_buffer);
|
|
417
|
+
// for all paragraphs in the document (again)
|
|
418
|
+
for (par_i = docs[doc_i]; par_i < docs[doc_i + 1]; par_i++) {
|
|
419
|
+
int first_token = pars[docs[doc_i]];
|
|
420
|
+
int last_token = pars[docs[doc_i + 1]] - 1;
|
|
421
|
+
// for all tokens in the paragraph
|
|
422
|
+
int tok_i;
|
|
423
|
+
for (tok_i = pars[par_i]; tok_i < pars[par_i + 1]; tok_i++) {
|
|
424
|
+
char *token = tokens[tok_i];
|
|
425
|
+
int bad_token = bad_par[par_i];
|
|
426
|
+
// tags at document boundaries are preserved if there is
|
|
427
|
+
// at least one good paragraph, removed otherwise
|
|
428
|
+
if (tok_i == first_token && strncmp(token, doc_tag, doc_tag_len) == 0)
|
|
429
|
+
bad_token = all_bad;
|
|
430
|
+
if (tok_i == last_token && strcmp(token, doc_end_tag) == 0)
|
|
431
|
+
bad_token = all_bad;
|
|
432
|
+
if (token[0] == '<')
|
|
433
|
+
continue;
|
|
434
|
+
// store hashes of n-grams
|
|
435
|
+
hash_t hash = buzhash(token, &bh_buffer);
|
|
436
|
+
hash_t masked_hash = hash & hash_bitmask;
|
|
437
|
+
if (!buzhash_is_full_buffer(&bh_buffer))
|
|
438
|
+
continue;
|
|
439
|
+
if (!bad_par[par_i]) {
|
|
440
|
+
if (have_dupl_ngrams) {
|
|
441
|
+
// If we have the list of hashes of all duplicate
|
|
442
|
+
// n-grams, we set the least significant bit of the
|
|
443
|
+
// stored hash to 1 if we have seen the matching
|
|
444
|
+
// duplicate n-gram to indicate it has been seen.
|
|
445
|
+
// Unique n-grams are ignored.
|
|
446
|
+
if (global.erase(masked_hash & BITMASK_HIGH63))
|
|
447
|
+
#ifdef GOOGLE_SPARSE
|
|
448
|
+
global.insert(std::make_pair(masked_hash | 1, fileNameIndex));
|
|
449
|
+
|
|
450
|
+
#else
|
|
451
|
+
global[masked_hash | 1] = fileNameIndex;
|
|
452
|
+
#endif
|
|
453
|
+
} else {
|
|
454
|
+
// otherwise we have to store hashes of all n-grams
|
|
455
|
+
#ifdef GOOGLE_SPARSE
|
|
456
|
+
global.insert(std::make_pair(masked_hash, fileNameIndex));
|
|
457
|
+
#else
|
|
458
|
+
global[masked_hash] = fileNameIndex;
|
|
459
|
+
#endif
|
|
460
|
+
}
|
|
461
|
+
}
|
|
462
|
+
}
|
|
463
|
+
}
|
|
464
|
+
}
|
|
465
|
+
|
|
466
|
+
// copy the unprocessed data to the beginning of the buffer
|
|
467
|
+
if (!feof(Input)) {
|
|
468
|
+
char *last_doc_fst_tok = tokens[pars[docs[doc_count - 1]]];
|
|
469
|
+
int processed_bytes = last_doc_fst_tok - buffer;
|
|
470
|
+
total_processed_bytes += processed_bytes;
|
|
471
|
+
int remaining_bytes = buffer_size - processed_bytes;
|
|
472
|
+
char *buffer_end = last_doc_fst_tok + remaining_bytes;
|
|
473
|
+
char *pos;
|
|
474
|
+
// replace \0s with EOLs (revert the buffer contents to original)
|
|
475
|
+
for (pos = last_doc_fst_tok; pos < buffer_end; pos++)
|
|
476
|
+
if (pos[0] == '\0')
|
|
477
|
+
pos[0] = '\n';
|
|
478
|
+
// print progress information
|
|
479
|
+
if (!Quiet) {
|
|
480
|
+
float percent_done = -1;
|
|
481
|
+
if (Input_size > 0)
|
|
482
|
+
percent_done = 100.0 * total_processed_bytes / Input_size;
|
|
483
|
+
print_progress("removing duplicates", total_processed_bytes,
|
|
484
|
+
percent_done);
|
|
485
|
+
}
|
|
486
|
+
memmove(buffer, last_doc_fst_tok, remaining_bytes);
|
|
487
|
+
buffer_content = remaining_bytes;
|
|
488
|
+
}
|
|
489
|
+
printf("end of process_one_file\n");
|
|
490
|
+
}
|
|
491
|
+
|
|
492
|
+
int main(int argc, char **argv) {
|
|
493
|
+
printf("Dupl Threshold %f \n", Dupl_thres);
|
|
494
|
+
printf("N-gram size %d \n", Ngram_size);
|
|
495
|
+
// get options
|
|
496
|
+
int c;
|
|
497
|
+
char *endptr;
|
|
498
|
+
char *datasetname = NULL;
|
|
499
|
+
char *listOfFilesPath = NULL;
|
|
500
|
+
char *output_dir = NULL;
|
|
501
|
+
|
|
502
|
+
while ((c = getopt(argc, argv, "f:n:t:d:p:smT:l:b:qVhD:L:O:")) != -1) {
|
|
503
|
+
errno = 0;
|
|
504
|
+
switch (c) {
|
|
505
|
+
case 'f':
|
|
506
|
+
Dupl_hashes_path = optarg;
|
|
507
|
+
break;
|
|
508
|
+
case 'n':
|
|
509
|
+
Ngram_size = strtol(optarg, &endptr, 10);
|
|
510
|
+
if (errno != 0 || *endptr != '\0') {
|
|
511
|
+
fprintf(stderr, "Integer value expected for -n, got: %s\n", optarg);
|
|
512
|
+
print_usage(stderr);
|
|
513
|
+
return 1;
|
|
514
|
+
}
|
|
515
|
+
break;
|
|
516
|
+
case 't':
|
|
517
|
+
Dupl_thres = strtod(optarg, &endptr);
|
|
518
|
+
if (errno != 0 || *endptr != '\0') {
|
|
519
|
+
fprintf(stderr, "Float value expected for -t, got: %s\n", optarg);
|
|
520
|
+
print_usage(stderr);
|
|
521
|
+
return 1;
|
|
522
|
+
}
|
|
523
|
+
break;
|
|
524
|
+
case 'd':
|
|
525
|
+
Doc_tag = optarg;
|
|
526
|
+
break;
|
|
527
|
+
case 'p':
|
|
528
|
+
Par_tag = optarg;
|
|
529
|
+
break;
|
|
530
|
+
case 's':
|
|
531
|
+
Strip_dupl = 1;
|
|
532
|
+
break;
|
|
533
|
+
case 'm':
|
|
534
|
+
No_smoothing = 1;
|
|
535
|
+
break;
|
|
536
|
+
case 'T':
|
|
537
|
+
Trim_hashes = strtol(optarg, &endptr, 10);
|
|
538
|
+
if (errno != 0 || *endptr != '\0') {
|
|
539
|
+
fprintf(stderr, "Integer value expected for -T, got: %s\n", optarg);
|
|
540
|
+
print_usage(stderr);
|
|
541
|
+
return 1;
|
|
542
|
+
}
|
|
543
|
+
break;
|
|
544
|
+
case 'l':
|
|
545
|
+
Max_stub_length = strtol(optarg, &endptr, 10);
|
|
546
|
+
if (errno != 0 || *endptr != '\0') {
|
|
547
|
+
fprintf(stderr, "Integer value expected for -l, got: %s\n", optarg);
|
|
548
|
+
print_usage(stderr);
|
|
549
|
+
return 1;
|
|
550
|
+
}
|
|
551
|
+
break;
|
|
552
|
+
case 'b':
|
|
553
|
+
Buffer_size = strtol(optarg, &endptr, 10);
|
|
554
|
+
if (errno != 0 || *endptr != '\0') {
|
|
555
|
+
fprintf(stderr, "Integer value expected for -b, got: %s\n", optarg);
|
|
556
|
+
print_usage(stderr);
|
|
557
|
+
return 1;
|
|
558
|
+
}
|
|
559
|
+
break;
|
|
560
|
+
case 'q':
|
|
561
|
+
Quiet = 1;
|
|
562
|
+
break;
|
|
563
|
+
case 'V':
|
|
564
|
+
print_version("onion");
|
|
565
|
+
return 0;
|
|
566
|
+
case 'h':
|
|
567
|
+
print_usage(stdout);
|
|
568
|
+
return 0;
|
|
569
|
+
case '?':
|
|
570
|
+
print_usage(stderr);
|
|
571
|
+
return 1;
|
|
572
|
+
|
|
573
|
+
case 'D':
|
|
574
|
+
datasetname = optarg;
|
|
575
|
+
break;
|
|
576
|
+
case 'L':
|
|
577
|
+
listOfFilesPath = optarg;
|
|
578
|
+
break;
|
|
579
|
+
case 'O':
|
|
580
|
+
output_dir = optarg;
|
|
581
|
+
break;
|
|
582
|
+
}
|
|
583
|
+
}
|
|
584
|
+
|
|
585
|
+
if (output_dir == NULL || datasetname == NULL || listOfFilesPath == NULL) {
|
|
586
|
+
std::cerr << "Missing required parameters.\n";
|
|
587
|
+
return 1;
|
|
588
|
+
}
|
|
589
|
+
|
|
590
|
+
FILE *List_of_Files = fopen(listOfFilesPath, "r");
|
|
591
|
+
if (List_of_Files == NULL) {
|
|
592
|
+
perror("Unable to open list of files");
|
|
593
|
+
return 1;
|
|
594
|
+
}
|
|
595
|
+
|
|
596
|
+
struct stat st = {0};
|
|
597
|
+
if (stat(output_dir, &st) == -1) {
|
|
598
|
+
if (mkdir(output_dir, 0700) != 0) {
|
|
599
|
+
perror("Failed to create output directory");
|
|
600
|
+
return 1;
|
|
601
|
+
}
|
|
602
|
+
}
|
|
603
|
+
|
|
604
|
+
Input = stdin;
|
|
605
|
+
Input_size = -1;
|
|
606
|
+
if (optind < argc) {
|
|
607
|
+
char *filename = argv[optind];
|
|
608
|
+
if (strcmp(filename, "-") != 0) {
|
|
609
|
+
errno = 0;
|
|
610
|
+
Input = fopen(filename, "r");
|
|
611
|
+
if (errno != 0) {
|
|
612
|
+
fprintf(stderr, "Unable to open %s for reading.\n", filename);
|
|
613
|
+
return 1;
|
|
614
|
+
}
|
|
615
|
+
fseek(Input, 0L, SEEK_END);
|
|
616
|
+
Input_size = ftell(Input);
|
|
617
|
+
fseek(Input, 0L, SEEK_SET);
|
|
618
|
+
}
|
|
619
|
+
}
|
|
620
|
+
|
|
621
|
+
unsigned long int total_processed_bytes = 0;
|
|
622
|
+
|
|
623
|
+
// patterns
|
|
624
|
+
char *doc_tag = (char *)malloc((strlen(Doc_tag) + 1 + 1) * sizeof(char));
|
|
625
|
+
strcat(strcpy(doc_tag, "<"), Doc_tag);
|
|
626
|
+
char *doc_end_tag = (char *)malloc((strlen(Doc_tag) + 3 + 1) * sizeof(char));
|
|
627
|
+
strcat(strcat(strcpy(doc_end_tag, "</"), Doc_tag), ">");
|
|
628
|
+
char *par_tag = (char *)malloc((strlen(Par_tag) + 1 + 1) * sizeof(char));
|
|
629
|
+
strcat(strcpy(par_tag, "<"), Par_tag);
|
|
630
|
+
char *par_end_tag = (char *)malloc((strlen(Par_tag) + 3 + 1) * sizeof(char));
|
|
631
|
+
strcat(strcat(strcpy(par_end_tag, "</"), Par_tag), ">");
|
|
632
|
+
|
|
633
|
+
int doc_tag_len = strlen(doc_tag);
|
|
634
|
+
int doc_end_tag_len = strlen(doc_end_tag);
|
|
635
|
+
int par_tag_len = strlen(par_tag);
|
|
636
|
+
int par_end_tag_len = strlen(par_end_tag);
|
|
637
|
+
|
|
638
|
+
// bitmask for trimming ngram hashes
|
|
639
|
+
hash_t hash_bitmask = 0xfffffffffffffffful;
|
|
640
|
+
int bitshift = 64 - Trim_hashes;
|
|
641
|
+
if (bitshift > 0)
|
|
642
|
+
hash_bitmask >>= bitshift;
|
|
643
|
+
|
|
644
|
+
// data structures
|
|
645
|
+
int buffer_size = 0;
|
|
646
|
+
int buffer_content = 0;
|
|
647
|
+
char *buffer = (char *)malloc((Buffer_size + 1) * sizeof(char));
|
|
648
|
+
char **tokens = (char **)malloc((Buffer_size + 1) * sizeof(char *));
|
|
649
|
+
int *pars = (int *)malloc((Buffer_size + 1) *
|
|
650
|
+
sizeof(int)); // array of starting tokens
|
|
651
|
+
int *par_len = (int *)malloc((Buffer_size + 1) * sizeof(int));
|
|
652
|
+
char *bad_par = (char *)malloc((Buffer_size + 1) * sizeof(char));
|
|
653
|
+
int *docs =
|
|
654
|
+
(int *)malloc((Buffer_size + 1) * sizeof(int)); // array of starting pars
|
|
655
|
+
int token_count, par_count, doc_count;
|
|
656
|
+
|
|
657
|
+
// buzhash
|
|
658
|
+
buzhash_buffer_t bh_buffer;
|
|
659
|
+
buzhash_init_buffer(&bh_buffer, Ngram_size);
|
|
660
|
+
|
|
661
|
+
// global hash table stores the hashes of all files read so far (in our case
|
|
662
|
+
// the full document since we did not mark paragraph separators) local hash
|
|
663
|
+
// table stores the hahses of n-grams found within the currently processed
|
|
664
|
+
// file (doc as a whole in our case)
|
|
665
|
+
ngrhash global, local;
|
|
666
|
+
#ifdef GOOGLE_SPARSE
|
|
667
|
+
global.set_deleted_key(0);
|
|
668
|
+
local.set_deleted_key(0);
|
|
669
|
+
#endif
|
|
670
|
+
|
|
671
|
+
// read hashes of duplicate n-grams if available
|
|
672
|
+
int have_dupl_ngrams = 0;
|
|
673
|
+
if (Dupl_hashes_path != NULL) {
|
|
674
|
+
have_dupl_ngrams = 1;
|
|
675
|
+
errno = 0;
|
|
676
|
+
FILE *ngrams_fp = fopen(Dupl_hashes_path, "r");
|
|
677
|
+
if (errno != 0) {
|
|
678
|
+
fprintf(stderr, "Unable to open %s for reading.\n", Dupl_hashes_path);
|
|
679
|
+
return 1;
|
|
680
|
+
}
|
|
681
|
+
fseek(ngrams_fp, 0L, SEEK_END);
|
|
682
|
+
unsigned long int ngrams_size = ftell(ngrams_fp);
|
|
683
|
+
fseek(ngrams_fp, 0L, SEEK_SET);
|
|
684
|
+
|
|
685
|
+
unsigned long int bytes_read = 0;
|
|
686
|
+
hash_t hash;
|
|
687
|
+
while (fread(&hash, sizeof(hash), 1, ngrams_fp)) {
|
|
688
|
+
printf("reading");
|
|
689
|
+
bytes_read += sizeof(hash);
|
|
690
|
+
hash_t masked_hash = hash & hash_bitmask;
|
|
691
|
+
// store only the 63 most significant bits of the hash;
|
|
692
|
+
// reserve the last bit as a flag (seen / unseen)
|
|
693
|
+
// #ifdef GOOGLE_SPARSE
|
|
694
|
+
// global.insert(std::make_pair(masked_hash | 1,
|
|
695
|
+
// fileNameIndex));//global.insert (masked_hash & BITMASK_HIGH63); #else
|
|
696
|
+
// global[masked_hash & BITMASK_HIGH63] = true;
|
|
697
|
+
// #endif
|
|
698
|
+
|
|
699
|
+
// print progress information
|
|
700
|
+
if (!Quiet && bytes_read % (10000000 * sizeof(hash)) == 0) {
|
|
701
|
+
float percent_done = -1;
|
|
702
|
+
if (ngrams_size > 0)
|
|
703
|
+
percent_done = 100.0 * bytes_read / ngrams_size;
|
|
704
|
+
print_progress("reading hashes", bytes_read, percent_done);
|
|
705
|
+
}
|
|
706
|
+
}
|
|
707
|
+
|
|
708
|
+
if (!Quiet)
|
|
709
|
+
print_progress("reading hashes", bytes_read, 100);
|
|
710
|
+
}
|
|
711
|
+
|
|
712
|
+
int fileIndex =
|
|
713
|
+
0; // File index t store every 1000 duplicates in a new csv file
|
|
714
|
+
|
|
715
|
+
while (!feof(List_of_Files)) {
|
|
716
|
+
fileNameIndex++;
|
|
717
|
+
/* read the name of each file in the list of files*/
|
|
718
|
+
if (fgets(current_file_name, sizeof(current_file_name), List_of_Files) ==
|
|
719
|
+
NULL) {
|
|
720
|
+
break;
|
|
721
|
+
}
|
|
722
|
+
int last_ch = current_file_name[strlen(current_file_name) - 1];
|
|
723
|
+
if (last_ch == '\n') {
|
|
724
|
+
current_file_name[strlen(current_file_name) - 1] = '\0';
|
|
725
|
+
}
|
|
726
|
+
/* use fopen to assign Input to it*/
|
|
727
|
+
Input = fopen(current_file_name, "r");
|
|
728
|
+
if (Input == NULL) {
|
|
729
|
+
char err_msg[1024];
|
|
730
|
+
snprintf(err_msg, 1023, "Error to open data file %s--",
|
|
731
|
+
current_file_name);
|
|
732
|
+
perror(err_msg);
|
|
733
|
+
continue;
|
|
734
|
+
}
|
|
735
|
+
|
|
736
|
+
printf("File Number: %d \n", fileNameIndex);
|
|
737
|
+
|
|
738
|
+
/* Process it as below*/
|
|
739
|
+
// it modifies the data structures passed to it as arguments, processes the
|
|
740
|
+
// input file, and updates global state
|
|
741
|
+
process_one_file(buffer_content, /* int */
|
|
742
|
+
tokens, /* char** */
|
|
743
|
+
pars, /* int* */
|
|
744
|
+
par_len, /* int* */
|
|
745
|
+
bad_par, /* char* */
|
|
746
|
+
|
|
747
|
+
docs, /* int* */
|
|
748
|
+
|
|
749
|
+
doc_tag, /* char* */
|
|
750
|
+
doc_tag_len, /* int */
|
|
751
|
+
doc_end_tag, /* char* */
|
|
752
|
+
doc_end_tag_len, /* int */
|
|
753
|
+
par_tag, /* char* */
|
|
754
|
+
par_tag_len, /* int */
|
|
755
|
+
par_end_tag, /* char* */
|
|
756
|
+
par_end_tag_len, /* int */
|
|
757
|
+
|
|
758
|
+
have_dupl_ngrams, /* long int int */
|
|
759
|
+
total_processed_bytes, /* unsigned */
|
|
760
|
+
// make sure these become mutable
|
|
761
|
+
global, /* ngrhash */
|
|
762
|
+
local, /* ngrhash */
|
|
763
|
+
bh_buffer, /* buzhash_buffer_t */
|
|
764
|
+
buffer /* char* */
|
|
765
|
+
);
|
|
766
|
+
|
|
767
|
+
if (DuplicateFilenames.size() > 1000) {
|
|
768
|
+
fileIndex++;
|
|
769
|
+
std::string filename = std::string(output_dir) + "Duplicate_pair_files_" +
|
|
770
|
+
std::string(datasetname) + "_" +
|
|
771
|
+
to_string(fileIndex) + ".csv";
|
|
772
|
+
writeFilenameDuplicateToCSV(DuplicateFilenames, filename);
|
|
773
|
+
DuplicateFilenames.clear();
|
|
774
|
+
printf(" 1000 Duplicated files are saved\n");
|
|
775
|
+
}
|
|
776
|
+
|
|
777
|
+
fclose(Input);
|
|
778
|
+
}
|
|
779
|
+
|
|
780
|
+
fileIndex++;
|
|
781
|
+
std::string filename = std::string(output_dir) + "Duplicate_pair_files_" +
|
|
782
|
+
std::string(datasetname) + "_" + to_string(fileIndex) +
|
|
783
|
+
".csv";
|
|
784
|
+
writeFilenameDuplicateToCSV(DuplicateFilenames, filename);
|
|
785
|
+
std::string name = std::string(output_dir) + "Unchecked_files_" +
|
|
786
|
+
std::string(datasetname) + "_" + ".csv";
|
|
787
|
+
writeUncheckedFilenamesToCSV(UncheckedFilenames, name);
|
|
788
|
+
|
|
789
|
+
// print progress information
|
|
790
|
+
total_processed_bytes += buffer_size;
|
|
791
|
+
if (!Quiet)
|
|
792
|
+
print_progress("removing duplicates", total_processed_bytes, 100);
|
|
793
|
+
|
|
794
|
+
// save the global hash map
|
|
795
|
+
// saveGlobalHashmap(global); // This will append to the existing file or
|
|
796
|
+
// create a new one if not present
|
|
797
|
+
|
|
798
|
+
return 0;
|
|
799
|
+
}
|