dalla-data-processing 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dalla/__init__.py +27 -0
- dalla/cli.py +453 -0
- dalla/core/__init__.py +6 -0
- dalla/core/dataset.py +387 -0
- dalla/core/parallel.py +279 -0
- dalla/deduplication/__init__.py +370 -0
- dalla/deduplication/bin/.gitignore +1 -0
- dalla/deduplication/bin/onion-linux-x86_64 +0 -0
- dalla/deduplication/onion/COPYING +24 -0
- dalla/deduplication/onion/Makefile +21 -0
- dalla/deduplication/onion/Makefile.config +3 -0
- dalla/deduplication/onion/README.md +21 -0
- dalla/deduplication/onion/src/Makefile +22 -0
- dalla/deduplication/onion/src/Makefile.g +23 -0
- dalla/deduplication/onion/src/buzhash.c +325 -0
- dalla/deduplication/onion/src/buzhash.h +30 -0
- dalla/deduplication/onion/src/hashdup.c +172 -0
- dalla/deduplication/onion/src/hashgen.c +206 -0
- dalla/deduplication/onion/src/onion +0 -0
- dalla/deduplication/onion/src/onion.c +799 -0
- dalla/deduplication/onion/src/onion_dup.c +824 -0
- dalla/deduplication/onion/src/version.c +17 -0
- dalla/deduplication/onion/src/version.h +10 -0
- dalla/deduplication/onion/src_sc/Makefile +22 -0
- dalla/deduplication/onion/src_sc/Makefile.g +23 -0
- dalla/deduplication/onion/src_sc/buzhash.c +325 -0
- dalla/deduplication/onion/src_sc/buzhash.h +30 -0
- dalla/deduplication/onion/src_sc/hashdup +0 -0
- dalla/deduplication/onion/src_sc/hashdup.c +172 -0
- dalla/deduplication/onion/src_sc/hashgen +0 -0
- dalla/deduplication/onion/src_sc/hashgen.c +206 -0
- dalla/deduplication/onion/src_sc/onion.c +854 -0
- dalla/deduplication/onion/src_sc/onion_dup.c +824 -0
- dalla/deduplication/onion/src_sc/version.c +17 -0
- dalla/deduplication/onion/src_sc/version.h +10 -0
- dalla/deduplication/onion_wrapper.py +223 -0
- dalla/deduplication/postprocessing.py +216 -0
- dalla/deduplication/preprocessing.py +120 -0
- dalla/quality/__init__.py +5 -0
- dalla/quality/checker.py +354 -0
- dalla/readability/__init__.py +197 -0
- dalla/readability/ranking.py +165 -0
- dalla/readability/scorer.py +148 -0
- dalla/stemming/__init__.py +551 -0
- dalla/stemming/data/words_al.txt +3414 -0
- dalla/stemming/data/words_al_t.txt +885 -0
- dalla/stemming/data/words_t.txt +7 -0
- dalla/utils/__init__.py +10 -0
- dalla/utils/logger.py +128 -0
- dalla/utils/tokenize.py +89 -0
- dalla_data_processing-0.0.1.dist-info/METADATA +393 -0
- dalla_data_processing-0.0.1.dist-info/RECORD +55 -0
- dalla_data_processing-0.0.1.dist-info/WHEEL +5 -0
- dalla_data_processing-0.0.1.dist-info/entry_points.txt +2 -0
- dalla_data_processing-0.0.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/*********************************************************************
|
|
2
|
+
* Copyright (c) 2011-2016 Jan Pomikalek, Milos Jakubicek *
|
|
3
|
+
* All rights reserved. *
|
|
4
|
+
* *
|
|
5
|
+
* This software is licensed as described in the file COPYING, which *
|
|
6
|
+
* you should have received as part of this distribution. *
|
|
7
|
+
*********************************************************************/
|
|
8
|
+
|
|
9
|
+
#include <stdio.h>
|
|
10
|
+
#include <stdlib.h>
|
|
11
|
+
#include <string.h>
|
|
12
|
+
#include "version.h"
|
|
13
|
+
|
|
14
|
+
void print_version(const char* progname) {
|
|
15
|
+
printf("%s: onion v%s\n\n", progname, VERSION);
|
|
16
|
+
printf("Copyright (c) 2011-2020 Lexical Computing Limited and Lexical Computing CZ s.r.o.\n");
|
|
17
|
+
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/*********************************************************************
|
|
2
|
+
* Copyright (c) 2011-2016 Jan Pomikalek, Milos Jakubicek *
|
|
3
|
+
* All rights reserved. *
|
|
4
|
+
* *
|
|
5
|
+
* This software is licensed as described in the file COPYING, which *
|
|
6
|
+
* you should have received as part of this distribution. *
|
|
7
|
+
*********************************************************************/
|
|
8
|
+
|
|
9
|
+
#define VERSION "1.4"
|
|
10
|
+
void print_version(const char* progname);
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Wrapper for the onion C binary deduplication tool.
|
|
3
|
+
Handles execution of the onion binary and management of temporary files.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import shutil
|
|
7
|
+
import subprocess
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from dalla.utils.logger import get_logger
|
|
11
|
+
|
|
12
|
+
logger = get_logger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def find_onion_binary() -> Path | None:
    """
    Locate an onion executable.

    Candidates are tried in this order and the first match wins:
        1. ``onion`` on the system PATH
        2. Bundled platform-specific binary ``bin/onion-<system>-<machine>``
        3. Generic bundled binary ``bin/onion``
        4. Local builds ``onion/src_sc/onion`` and ``onion/src/onion``

    Candidates 2-4 are resolved relative to the directory containing this
    module.

    Returns:
        Path to onion binary or None if not found
    """
    import platform

    onion_path = shutil.which("onion")
    if onion_path:
        logger.info(f"Found onion in system PATH: {onion_path}")
        return Path(onion_path)

    package_dir = Path(__file__).parent
    bin_dir = package_dir / "bin"

    # The lower-cased platform.system() value is used directly as the
    # platform part of the bundled file name (e.g. "onion-linux-x86_64").
    platform_name = platform.system().lower()
    machine = platform.machine().lower()

    # (log label, path) pairs in priority order.
    candidates = [
        ("bundled platform-specific binary", bin_dir / f"onion-{platform_name}-{machine}"),
        ("bundled onion binary", bin_dir / "onion"),
        ("local onion binary", package_dir / "onion" / "src_sc" / "onion"),
        ("local onion binary", package_dir / "onion" / "src" / "onion"),
    ]

    for label, candidate in candidates:
        # is_file() is False for missing paths, so no separate exists() check.
        if candidate.is_file():
            logger.info(f"Found {label}: {candidate}")
            return candidate

    logger.warning("Onion binary not found in PATH or package")
    return None
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def run_onion(
|
|
76
|
+
file_list_path: Path,
|
|
77
|
+
output_dir: Path,
|
|
78
|
+
dataset_name: str = "dataset",
|
|
79
|
+
onion_binary: Path | None = None,
|
|
80
|
+
) -> tuple[bool, Path | None]:
|
|
81
|
+
"""
|
|
82
|
+
Run the onion deduplication binary.
|
|
83
|
+
|
|
84
|
+
Args:
|
|
85
|
+
file_list_path: Path to file containing list of files to process
|
|
86
|
+
output_dir: Directory for output CSV files
|
|
87
|
+
dataset_name: Name of the dataset (for output files)
|
|
88
|
+
onion_binary: Path to onion binary (auto-detected if None)
|
|
89
|
+
|
|
90
|
+
Returns:
|
|
91
|
+
Tuple of (success: bool, output_csv_dir: Path)
|
|
92
|
+
"""
|
|
93
|
+
if onion_binary is None:
|
|
94
|
+
onion_binary = find_onion_binary()
|
|
95
|
+
|
|
96
|
+
if onion_binary is None:
|
|
97
|
+
raise FileNotFoundError(
|
|
98
|
+
"Onion binary not found. Please install onion or provide path to binary.\n"
|
|
99
|
+
"Installation: https://corpus.tools/wiki/Onion"
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
onion_binary = Path(onion_binary)
|
|
103
|
+
if not onion_binary.exists():
|
|
104
|
+
raise FileNotFoundError(f"Onion binary not found at: {onion_binary}")
|
|
105
|
+
|
|
106
|
+
output_dir = Path(output_dir)
|
|
107
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
108
|
+
|
|
109
|
+
# onion expects: ./onion -D <dataset-name> -L <file-list> -O <output-dir>
|
|
110
|
+
cmd = [
|
|
111
|
+
str(onion_binary),
|
|
112
|
+
"-D",
|
|
113
|
+
dataset_name,
|
|
114
|
+
"-L",
|
|
115
|
+
str(file_list_path),
|
|
116
|
+
"-O",
|
|
117
|
+
str(output_dir),
|
|
118
|
+
]
|
|
119
|
+
|
|
120
|
+
logger.info(f"Running onion: {' '.join(cmd)}")
|
|
121
|
+
logger.info(f"Processing file list: {file_list_path}")
|
|
122
|
+
logger.info(f"Output directory: {output_dir}")
|
|
123
|
+
|
|
124
|
+
try:
|
|
125
|
+
# Run onion
|
|
126
|
+
result = subprocess.run(
|
|
127
|
+
cmd,
|
|
128
|
+
capture_output=True,
|
|
129
|
+
text=True,
|
|
130
|
+
check=False,
|
|
131
|
+
timeout=21600, # 6 hour timeout
|
|
132
|
+
)
|
|
133
|
+
|
|
134
|
+
if result.returncode != 0:
|
|
135
|
+
logger.error(f"Onion failed with return code {result.returncode}")
|
|
136
|
+
logger.error(f"STDOUT: {result.stdout}")
|
|
137
|
+
logger.error(f"STDERR: {result.stderr}")
|
|
138
|
+
return False, None
|
|
139
|
+
|
|
140
|
+
logger.info("Onion execution completed successfully")
|
|
141
|
+
if result.stdout:
|
|
142
|
+
logger.debug(f"Onion output: {result.stdout}")
|
|
143
|
+
|
|
144
|
+
return True, output_dir
|
|
145
|
+
|
|
146
|
+
except subprocess.TimeoutExpired:
|
|
147
|
+
logger.error("Onion execution timed out after 1 hour")
|
|
148
|
+
return False, None
|
|
149
|
+
except Exception as e:
|
|
150
|
+
logger.error(f"Error running onion: {e}")
|
|
151
|
+
return False, None
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def compile_onion(source_dir: Path) -> Path | None:
    """
    Build the onion binary with ``make`` inside ``source_dir``.

    ``make clean`` is run first and its result is ignored. The build then
    runs ``make onion``, adding ``-f Makefile.g`` when that file exists.

    Args:
        source_dir: Directory containing onion source code (with Makefile
            or Makefile.g)

    Returns:
        Path to ``source_dir / "onion"`` when the build succeeds and the file
        exists afterwards; None on any failure (missing directory, missing
        Makefile, non-zero exit code, missing binary, or an exception).
    """
    source_dir = Path(source_dir)

    if not source_dir.exists():
        logger.error(f"Source directory not found: {source_dir}")
        return None

    makefile_g = source_dir / "Makefile.g"
    if not makefile_g.exists() and not (source_dir / "Makefile").exists():
        logger.error(f"No Makefile found in {source_dir}")
        return None

    logger.info(f"Compiling onion from source: {source_dir}")

    try:
        # Clean step: best effort, its exit status is not inspected.
        subprocess.run(
            ["make", "clean"],
            cwd=source_dir,
            capture_output=True,
            check=False,
        )

        build_cmd = ["make", "onion"]
        if makefile_g.exists():
            build_cmd += ["-f", "Makefile.g"]

        build = subprocess.run(
            build_cmd,
            cwd=source_dir,
            capture_output=True,
            text=True,
            check=False,
        )

        if build.returncode != 0:
            logger.error(f"Compilation failed: {build.stderr}")
            return None

        binary_path = source_dir / "onion"
        if not binary_path.exists():
            logger.error("Compilation succeeded but binary not found")
            return None

        logger.info(f"Successfully compiled onion: {binary_path}")
        return binary_path

    except Exception as e:
        logger.error(f"Error during compilation: {e}")
        return None
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
__all__ = [
|
|
220
|
+
"find_onion_binary",
|
|
221
|
+
"run_onion",
|
|
222
|
+
"compile_onion",
|
|
223
|
+
]
|
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Postprocessing utilities for onion output.
|
|
3
|
+
|
|
4
|
+
Parses CSV files from onion and extracts duplicate information.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import contextlib
|
|
8
|
+
import csv
|
|
9
|
+
from collections import defaultdict
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from tqdm import tqdm
|
|
13
|
+
|
|
14
|
+
from dalla.utils.logger import get_logger
|
|
15
|
+
|
|
16
|
+
logger = get_logger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def parse_onion_csv(csv_file: Path) -> list[tuple[str, int]]:
    """
    Parse a single onion CSV file.

    Onion CSV format:
        - First column: file path
        - Second column: line number (index in file list)

    Rows with fewer than two columns are ignored. A row whose second column
    is not an integer is skipped with a warning and parsing continues with
    the next row. If the file cannot be read or decoded, a warning is logged
    and the entries collected so far are returned.

    Args:
        csv_file: Path to CSV file

    Returns:
        List of (file_path, line_number) tuples
    """
    entries = []

    try:
        # newline="" lets the csv module do its own line-ending handling.
        with open(csv_file, encoding="utf-8", newline="") as f:
            for row_number, row in enumerate(csv.reader(f), start=1):
                if len(row) < 2:
                    continue

                try:
                    line_number = int(row[1])
                except ValueError:
                    # One bad row must not discard the remainder of the file.
                    logger.warning(
                        f"Skipping row {row_number} in {csv_file}: "
                        f"invalid line number {row[1]!r}"
                    )
                    continue

                file_path = row[0].strip().strip('"')
                entries.append((file_path, line_number))
    except (OSError, UnicodeError, csv.Error) as e:
        logger.warning(f"Error parsing CSV {csv_file}: {e}")

    return entries
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def extract_duplicates_from_csvs(
    csv_dir: Path,
    file_list_path: Path,
) -> dict[str, set[str]]:
    """
    Build a file -> duplicates mapping from onion CSV outputs.

    Every ``*.csv`` file in ``csv_dir`` is read as one candidate group. The
    1-based line numbers in its rows are looked up in the file list (blank
    lines removed) to obtain paths. Groups that resolve to fewer than two
    distinct paths are dropped.

    Args:
        csv_dir: Directory containing onion output CSVs
        file_list_path: Path to original file list (to resolve line numbers)

    Returns:
        Dictionary mapping each file to the set of other files that shared a
        group with it
    """
    csv_dir = Path(csv_dir)

    with open(file_list_path, encoding="utf-8") as handle:
        file_list = [entry for entry in map(str.strip, handle) if entry]

    logger.info(f"Loaded {len(file_list)} files from file list")

    csv_files = list(csv_dir.glob("*.csv"))
    logger.info(f"Found {len(csv_files)} CSV files in {csv_dir}")

    duplicate_groups = []

    for csv_file in tqdm(csv_files, desc="Parsing duplicate CSVs", unit="file"):
        entries = parse_onion_csv(csv_file)

        if not entries:
            logger.debug(f"CSV {csv_file.name} has no entries")
            continue

        logger.debug(f"CSV {csv_file.name}: {len(entries)} entries")

        group = set()
        # Only the line number is used; the path column is ignored here.
        for _csv_path, line_num in entries:
            if not 1 <= line_num <= len(file_list):
                logger.warning(f"Invalid line number {line_num} in {csv_file.name}")
                continue

            resolved_path = file_list[line_num - 1]
            group.add(resolved_path)
            logger.debug(f" Line {line_num} -> {Path(resolved_path).name}")

        logger.debug(f" Group has {len(group)} unique files")
        if len(group) <= 1:
            logger.debug(f" Skipping group with only {len(group)} file(s)")
            continue

        duplicate_groups.append(group)
        logger.info(f"Found duplicate group with {len(group)} files from {csv_file.name}")

    logger.info(f"Found {len(duplicate_groups)} duplicate groups")

    file_to_duplicates = defaultdict(set)

    for group in duplicate_groups:
        for member in group:
            # Everything in the group except the file itself.
            file_to_duplicates[member] |= group - {member}

    logger.info(f"Total files with duplicates: {len(file_to_duplicates)}")

    return dict(file_to_duplicates)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def create_duplicate_pairs_with_scores(
    csv_dir: Path,
    file_list_path: Path,
) -> list[dict]:
    """
    Turn onion phase 2 CSVs into (doc1, doc2, similarity) records.

    Each ``*.csv`` file in ``csv_dir`` is read row by row. The first column
    is the file path, the second a line number and the optional third a
    similarity score (1.0 when absent or unparsable). The line number is only
    used to look the path up in the file list when the path column is empty.
    Within one CSV, the first usable row is the source document and every
    later row is paired with it.

    Args:
        csv_dir: Directory containing onion phase 2 CSVs
        file_list_path: Path to file list

    Returns:
        List of duplicate pair dictionaries with format:
            {
                'doc1': file_path_1,
                'doc2': file_path_2,
                'similarity': score
            }
    """
    csv_dir = Path(csv_dir)

    with open(file_list_path, encoding="utf-8") as handle:
        file_list = [entry for entry in map(str.strip, handle) if entry]

    logger.info(f"Processing phase 2 outputs from {csv_dir}")

    pairs = []
    csv_files = list(csv_dir.glob("*.csv"))

    for csv_file in tqdm(csv_files, desc="Processing similarity scores", unit="file"):
        try:
            # Read the whole file first so a read error adds nothing from it.
            with open(csv_file, encoding="utf-8") as handle:
                rows = list(csv.reader(handle))

            for row in rows:
                if len(row) < 2:
                    continue

                file_path = row[0].strip().strip('"')
                line_num = int(row[1]) if row[1].isdigit() else None

                score = 1.0
                if len(row) > 2:
                    with contextlib.suppress(ValueError, IndexError):
                        score = float(row[2])

                # Fall back to the file list only when no path was given.
                if not file_path and line_num and 1 <= line_num <= len(file_list):
                    file_path = file_list[line_num - 1]

                if not file_path:
                    continue

                pairs.append(
                    {
                        "source_csv": csv_file.stem,
                        "file_path": file_path,
                        "similarity": score,
                    }
                )

        except Exception as e:
            logger.warning(f"Error processing {csv_file}: {e}")

    logger.info(f"Extracted {len(pairs)} duplicate entries from CSVs")

    csv_groups = defaultdict(list)
    for record in pairs:
        csv_groups[record["source_csv"]].append(record)

    duplicate_pairs = []
    for members in csv_groups.values():
        if len(members) < 2:
            continue

        head, *others = members
        source = head["file_path"]
        duplicate_pairs.extend(
            {
                "doc1": source,
                "doc2": other["file_path"],
                "similarity": other["similarity"],
            }
            for other in others
        )

    logger.info(f"Created {len(duplicate_pairs)} duplicate pairs")

    return duplicate_pairs
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
__all__ = [
|
|
213
|
+
"parse_onion_csv",
|
|
214
|
+
"extract_duplicates_from_csvs",
|
|
215
|
+
"create_duplicate_pairs_with_scores",
|
|
216
|
+
]
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Preprocessing utilities for deduplication.
|
|
3
|
+
|
|
4
|
+
Handles text normalization and conversion to vertical format for onion.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from camel_tools.utils.dediac import dediac_ar
|
|
11
|
+
from tqdm import tqdm
|
|
12
|
+
|
|
13
|
+
from dalla.utils.logger import get_logger
|
|
14
|
+
|
|
15
|
+
logger = get_logger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def text_to_vertical(text: str, doc_id: str = "") -> str:
    """
    Convert text to vertical format (one word per line).

    This format is required by the onion deduplication algorithm.
    Diacritics are removed with CAMEL Tools' ``dediac_ar`` and the result is
    split on whitespace; no further tokenization is applied.

    Args:
        text: Input text. None or any other non-string value is treated as
            an empty string.
        doc_id: Document identifier (optional). When non-empty, the output is
            wrapped in ``<doc name='...'>`` and ``</doc>`` lines. The
            identifier is inserted as-is, without escaping.

    Returns:
        Text in vertical format with optional doc tags
    """
    # isinstance() already rejects None, so no separate None test is needed.
    if not isinstance(text, str):
        text = ""

    vertical = "\n".join(dediac_ar(text).split())

    if doc_id:
        vertical = f"<doc name='{doc_id}'>\n{vertical}\n</doc>"

    return vertical
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def create_vert_files(
    texts: list[str],
    ids: list[str],
    output_dir: Path,
) -> tuple[list[Path], dict[str, str]]:
    """
    Write one vertical-format file per (text, id) pair.

    Files are named ``<8-digit index>_<sanitized id>.txt``. The sanitized id
    replaces every character other than word characters, ``-``, ``_`` and
    ``.`` with ``_`` and keeps at most its last 200 characters. Texts and ids
    are paired positionally; pairing stops at the shorter of the two.

    Args:
        texts: List of text strings
        ids: List of document IDs
        output_dir: Directory to save vert files (created if missing)

    Returns:
        Tuple of (list of file paths, mapping of original IDs to vert file paths)
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    logger.info(f"Creating {len(texts)} vert files in {output_dir}")

    file_paths = []
    id_mapping = {}

    progress = tqdm(zip(texts, ids, strict=False), total=len(texts), desc="Creating vert files")
    for idx, (text, doc_id) in enumerate(progress):
        doc_key = str(doc_id)

        # Slicing is a no-op for names of 200 characters or fewer.
        safe_filename = re.sub(r"[^\w\-_.]", "_", doc_key)[-200:]

        filepath = output_dir / f"{idx:08d}_{safe_filename}.txt"
        filepath.write_text(text_to_vertical(text, doc_id), encoding="utf-8")

        file_paths.append(filepath)
        id_mapping[doc_key] = str(filepath)

    logger.info(f"Created {len(file_paths)} vert files")

    return file_paths, id_mapping
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def create_file_list(file_paths: list[Path], output_file: Path) -> Path:
    """
    Write the given paths to a text file, one per line (input for onion).

    The parent directory of ``output_file`` is created when missing and an
    existing file is overwritten.

    Args:
        file_paths: List of file paths
        output_file: Path to output file list

    Returns:
        Path to created file list
    """
    output_file = Path(output_file)
    output_file.parent.mkdir(parents=True, exist_ok=True)

    with open(output_file, "w", encoding="utf-8") as listing:
        listing.writelines(f"{entry}\n" for entry in file_paths)

    logger.info(f"Created file list with {len(file_paths)} entries: {output_file}")

    return output_file
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
__all__ = [
|
|
117
|
+
"text_to_vertical",
|
|
118
|
+
"create_vert_files",
|
|
119
|
+
"create_file_list",
|
|
120
|
+
]
|