dalla-data-processing 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dalla/__init__.py +27 -0
- dalla/cli.py +453 -0
- dalla/core/__init__.py +6 -0
- dalla/core/dataset.py +387 -0
- dalla/core/parallel.py +279 -0
- dalla/deduplication/__init__.py +370 -0
- dalla/deduplication/bin/.gitignore +1 -0
- dalla/deduplication/bin/onion-linux-x86_64 +0 -0
- dalla/deduplication/onion/COPYING +24 -0
- dalla/deduplication/onion/Makefile +21 -0
- dalla/deduplication/onion/Makefile.config +3 -0
- dalla/deduplication/onion/README.md +21 -0
- dalla/deduplication/onion/src/Makefile +22 -0
- dalla/deduplication/onion/src/Makefile.g +23 -0
- dalla/deduplication/onion/src/buzhash.c +325 -0
- dalla/deduplication/onion/src/buzhash.h +30 -0
- dalla/deduplication/onion/src/hashdup.c +172 -0
- dalla/deduplication/onion/src/hashgen.c +206 -0
- dalla/deduplication/onion/src/onion +0 -0
- dalla/deduplication/onion/src/onion.c +799 -0
- dalla/deduplication/onion/src/onion_dup.c +824 -0
- dalla/deduplication/onion/src/version.c +17 -0
- dalla/deduplication/onion/src/version.h +10 -0
- dalla/deduplication/onion/src_sc/Makefile +22 -0
- dalla/deduplication/onion/src_sc/Makefile.g +23 -0
- dalla/deduplication/onion/src_sc/buzhash.c +325 -0
- dalla/deduplication/onion/src_sc/buzhash.h +30 -0
- dalla/deduplication/onion/src_sc/hashdup +0 -0
- dalla/deduplication/onion/src_sc/hashdup.c +172 -0
- dalla/deduplication/onion/src_sc/hashgen +0 -0
- dalla/deduplication/onion/src_sc/hashgen.c +206 -0
- dalla/deduplication/onion/src_sc/onion.c +854 -0
- dalla/deduplication/onion/src_sc/onion_dup.c +824 -0
- dalla/deduplication/onion/src_sc/version.c +17 -0
- dalla/deduplication/onion/src_sc/version.h +10 -0
- dalla/deduplication/onion_wrapper.py +223 -0
- dalla/deduplication/postprocessing.py +216 -0
- dalla/deduplication/preprocessing.py +120 -0
- dalla/quality/__init__.py +5 -0
- dalla/quality/checker.py +354 -0
- dalla/readability/__init__.py +197 -0
- dalla/readability/ranking.py +165 -0
- dalla/readability/scorer.py +148 -0
- dalla/stemming/__init__.py +551 -0
- dalla/stemming/data/words_al.txt +3414 -0
- dalla/stemming/data/words_al_t.txt +885 -0
- dalla/stemming/data/words_t.txt +7 -0
- dalla/utils/__init__.py +10 -0
- dalla/utils/logger.py +128 -0
- dalla/utils/tokenize.py +89 -0
- dalla_data_processing-0.0.1.dist-info/METADATA +393 -0
- dalla_data_processing-0.0.1.dist-info/RECORD +55 -0
- dalla_data_processing-0.0.1.dist-info/WHEEL +5 -0
- dalla_data_processing-0.0.1.dist-info/entry_points.txt +2 -0
- dalla_data_processing-0.0.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
/*********************************************************************
|
|
2
|
+
* Copyright (c) 2011-2015 Jan Pomikalek *
|
|
3
|
+
* All rights reserved. *
|
|
4
|
+
* *
|
|
5
|
+
* This software is licensed as described in the file COPYING, which *
|
|
6
|
+
* you should have received as part of this distribution. *
|
|
7
|
+
*********************************************************************/
|
|
8
|
+
|
|
9
|
+
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#include "buzhash.h"
#include "version.h"
|
|
17
|
+
|
|
18
|
+
/* Compile-time defaults. */
#define MAX_LINE_LENGTH 10000   /* size of the line buffer handed to fgets() */
#define NGRAM_SIZE 5            /* default for -n */
#define OUTPUT_PREFIX "hashes." /* default for -o */
#define OUTPUT_COUNT 10         /* default for -c */

/* Run-time options; main() overwrites them from the command line. */
int Ngram_size = NGRAM_SIZE;          /* n-gram length passed to buzhash_init_buffer() */
char *Output_prefix = OUTPUT_PREFIX;  /* output file names are <prefix><index> */
int Output_count = OUTPUT_COUNT;      /* number of output files / hash ranges */
int Quiet = 0;                        /* non-zero suppresses progress reports */

/* Input stream and its size in bytes; main() sets the size to -1 when unknown. */
FILE *Input;
long Input_size;
|
|
30
|
+
|
|
31
|
+
void print_usage(FILE *stream) {
|
|
32
|
+
fprintf(stream, "\
|
|
33
|
+
Usage: hashgen [OPTIONS] [FILE]\n\
|
|
34
|
+
Generate hashes of n-grams.\n\
|
|
35
|
+
\n\
|
|
36
|
+
-n NUM n-gram length (default: %i)\n\
|
|
37
|
+
-o STR prefix of output files (default: %s)\n\
|
|
38
|
+
-c NUM number of output files (default: %i)\n\
|
|
39
|
+
-q quiet; suppress all output except for errors\n\
|
|
40
|
+
\n\
|
|
41
|
+
-V print version information and exit\n\
|
|
42
|
+
-h display this help and exit\n\
|
|
43
|
+
\n\
|
|
44
|
+
With no FILE, or when FILE is -, read standard input.\n\
|
|
45
|
+
\n\
|
|
46
|
+
Project home page: <http://code.google.com/p/onion/>\n",
|
|
47
|
+
NGRAM_SIZE, OUTPUT_PREFIX, OUTPUT_COUNT);
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
/*
 * Print one timestamped progress line to stderr.
 *
 * processed_bytes  number of input bytes consumed so far; reported in MB.
 * percent_done     completion percentage; a negative value means "unknown"
 *                  and suppresses the percentage part of the line.
 */
void print_progress(unsigned long int processed_bytes, float percent_done) {
    time_t now;
    const char *stamp;

    time(&now);
    /* ctime() may return NULL for unrepresentable times; never pass that to %s. */
    stamp = ctime(&now);
    if (stamp == NULL)
        stamp = "unknown time";

    /* %.24s drops the trailing newline ctime() appends; %lu matches the
     * unsigned long argument (the signed %li conversion did not). */
    fprintf(stderr, "[%.24s] hashgen: %6lu MB processed", stamp,
            processed_bytes / (1024 * 1024));
    if (percent_done >= 0)
        fprintf(stderr, " (%6.2f%%)", percent_done);
    fprintf(stderr, "\n");
}
|
|
59
|
+
|
|
60
|
+
int main(int argc, char **argv) {
|
|
61
|
+
// get options
|
|
62
|
+
int c;
|
|
63
|
+
char *endptr;
|
|
64
|
+
while ((c = getopt(argc, argv, "n:o:c:qVh")) != -1) {
|
|
65
|
+
errno = 0;
|
|
66
|
+
switch (c) {
|
|
67
|
+
case 'n':
|
|
68
|
+
Ngram_size = strtol(optarg, &endptr, 10);
|
|
69
|
+
if (errno != 0 || *endptr != '\0') {
|
|
70
|
+
fprintf(stderr, "Integer value expected for -n, got: %s\n", optarg);
|
|
71
|
+
print_usage(stderr);
|
|
72
|
+
return 1;
|
|
73
|
+
}
|
|
74
|
+
break;
|
|
75
|
+
case 'o':
|
|
76
|
+
Output_prefix = optarg;
|
|
77
|
+
break;
|
|
78
|
+
case 'c':
|
|
79
|
+
Output_count = strtol(optarg, &endptr, 10);
|
|
80
|
+
if (errno != 0 || *endptr != '\0') {
|
|
81
|
+
fprintf(stderr, "Integer value expected for -c, got: %s\n", optarg);
|
|
82
|
+
print_usage(stderr);
|
|
83
|
+
return 1;
|
|
84
|
+
}
|
|
85
|
+
break;
|
|
86
|
+
case 'q':
|
|
87
|
+
Quiet = 1;
|
|
88
|
+
break;
|
|
89
|
+
case 'V':
|
|
90
|
+
print_version("hashgen");
|
|
91
|
+
return 0;
|
|
92
|
+
case 'h':
|
|
93
|
+
print_usage(stdout);
|
|
94
|
+
return 0;
|
|
95
|
+
case '?':
|
|
96
|
+
print_usage(stderr);
|
|
97
|
+
return 1;
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
Input = stdin;
|
|
102
|
+
Input_size = -1;
|
|
103
|
+
if (optind < argc) {
|
|
104
|
+
char* filename = argv[optind];
|
|
105
|
+
if (strcmp(filename, "-") != 0) {
|
|
106
|
+
errno = 0;
|
|
107
|
+
Input = fopen(filename, "r");
|
|
108
|
+
if (errno != 0) {
|
|
109
|
+
fprintf(stderr, "Unable to open %s for reading.\n", filename);
|
|
110
|
+
return 1;
|
|
111
|
+
}
|
|
112
|
+
fseek(Input, 0L, SEEK_END);
|
|
113
|
+
Input_size = ftell(Input);
|
|
114
|
+
fseek(Input, 0L, SEEK_SET);
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
// output files
|
|
119
|
+
FILE** output_files = (FILE**) malloc(Output_count * sizeof(FILE*));
|
|
120
|
+
char* filename = (char*) malloc(
|
|
121
|
+
(strlen(Output_prefix) + (Output_count/10+1) + 1) * sizeof(char));
|
|
122
|
+
int i;
|
|
123
|
+
for (i=0; i<Output_count; i++) {
|
|
124
|
+
sprintf(filename, "%s%i", Output_prefix, i);
|
|
125
|
+
errno = 0;
|
|
126
|
+
output_files[i] = fopen(filename, "w");
|
|
127
|
+
if (errno != 0) {
|
|
128
|
+
fprintf(stderr, "Unable to open %s for writing.\n", filename);
|
|
129
|
+
return 1;
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
free(filename);
|
|
133
|
+
|
|
134
|
+
// hash range boundaries
|
|
135
|
+
hash_t* range_boundaries = (hash_t*) malloc(Output_count * sizeof(hash_t));
|
|
136
|
+
hash_t range_size = BUZHASH_MAX / Output_count;
|
|
137
|
+
for (i=0; i<Output_count-1; i++)
|
|
138
|
+
range_boundaries[i] = (i+1) * range_size;
|
|
139
|
+
range_boundaries[Output_count-1] = BUZHASH_MAX;
|
|
140
|
+
|
|
141
|
+
// buzhash
|
|
142
|
+
hash_t hash;
|
|
143
|
+
buzhash_buffer_t bh_buffer;
|
|
144
|
+
buzhash_init_buffer(&bh_buffer, Ngram_size);
|
|
145
|
+
|
|
146
|
+
// other variables
|
|
147
|
+
char line[MAX_LINE_LENGTH];
|
|
148
|
+
unsigned long int line_number = 0;
|
|
149
|
+
unsigned long int processed_bytes = 0;
|
|
150
|
+
|
|
151
|
+
while (fgets(line, MAX_LINE_LENGTH, Input)) {
|
|
152
|
+
// read line and strip trailing newline
|
|
153
|
+
line_number++;
|
|
154
|
+
int linelen = strlen(line);
|
|
155
|
+
char* newline_pointer = strchr(line, '\n');
|
|
156
|
+
if (newline_pointer == NULL) {
|
|
157
|
+
if (linelen >= MAX_LINE_LENGTH - 1)
|
|
158
|
+
fprintf(stderr, "Warning: line %li too long; "
|
|
159
|
+
"processing only first %i chars.\n", line_number,
|
|
160
|
+
linelen);
|
|
161
|
+
else
|
|
162
|
+
fprintf(stderr, "Warning: line %li contains a NUL character; "
|
|
163
|
+
"processing only the first %i chars.\n", line_number,
|
|
164
|
+
linelen);
|
|
165
|
+
}
|
|
166
|
+
else {
|
|
167
|
+
*newline_pointer = '\0';
|
|
168
|
+
}
|
|
169
|
+
processed_bytes+= linelen;
|
|
170
|
+
|
|
171
|
+
// skip lines starting with <
|
|
172
|
+
if (line[0] == '<')
|
|
173
|
+
continue;
|
|
174
|
+
|
|
175
|
+
// compute hash
|
|
176
|
+
hash = buzhash(line, &bh_buffer);
|
|
177
|
+
if (!buzhash_is_full_buffer(&bh_buffer))
|
|
178
|
+
continue;
|
|
179
|
+
|
|
180
|
+
// store hash in the correct file
|
|
181
|
+
int range_index = 0;
|
|
182
|
+
while (hash > range_boundaries[range_index])
|
|
183
|
+
range_index++;
|
|
184
|
+
fwrite(&hash, sizeof(hash), 1, output_files[range_index]);
|
|
185
|
+
|
|
186
|
+
// print progress information
|
|
187
|
+
if (!Quiet && line_number % 10000000 == 0) {
|
|
188
|
+
float percent_done = -1;
|
|
189
|
+
if (Input_size > 0)
|
|
190
|
+
percent_done = 100.0 * processed_bytes / Input_size;
|
|
191
|
+
print_progress(processed_bytes, percent_done);
|
|
192
|
+
}
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
// print progress information
|
|
196
|
+
if (!Quiet)
|
|
197
|
+
print_progress(processed_bytes, 100);
|
|
198
|
+
|
|
199
|
+
for (i=0; i<Output_count; i++)
|
|
200
|
+
fclose(output_files[i]);
|
|
201
|
+
|
|
202
|
+
if (Input != stdin)
|
|
203
|
+
fclose(Input);
|
|
204
|
+
|
|
205
|
+
return 0;
|
|
206
|
+
}
|
|
Binary file
|