dalla-data-processing 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. dalla/__init__.py +27 -0
  2. dalla/cli.py +453 -0
  3. dalla/core/__init__.py +6 -0
  4. dalla/core/dataset.py +387 -0
  5. dalla/core/parallel.py +279 -0
  6. dalla/deduplication/__init__.py +370 -0
  7. dalla/deduplication/bin/.gitignore +1 -0
  8. dalla/deduplication/bin/onion-linux-x86_64 +0 -0
  9. dalla/deduplication/onion/COPYING +24 -0
  10. dalla/deduplication/onion/Makefile +21 -0
  11. dalla/deduplication/onion/Makefile.config +3 -0
  12. dalla/deduplication/onion/README.md +21 -0
  13. dalla/deduplication/onion/src/Makefile +22 -0
  14. dalla/deduplication/onion/src/Makefile.g +23 -0
  15. dalla/deduplication/onion/src/buzhash.c +325 -0
  16. dalla/deduplication/onion/src/buzhash.h +30 -0
  17. dalla/deduplication/onion/src/hashdup.c +172 -0
  18. dalla/deduplication/onion/src/hashgen.c +206 -0
  19. dalla/deduplication/onion/src/onion +0 -0
  20. dalla/deduplication/onion/src/onion.c +799 -0
  21. dalla/deduplication/onion/src/onion_dup.c +824 -0
  22. dalla/deduplication/onion/src/version.c +17 -0
  23. dalla/deduplication/onion/src/version.h +10 -0
  24. dalla/deduplication/onion/src_sc/Makefile +22 -0
  25. dalla/deduplication/onion/src_sc/Makefile.g +23 -0
  26. dalla/deduplication/onion/src_sc/buzhash.c +325 -0
  27. dalla/deduplication/onion/src_sc/buzhash.h +30 -0
  28. dalla/deduplication/onion/src_sc/hashdup +0 -0
  29. dalla/deduplication/onion/src_sc/hashdup.c +172 -0
  30. dalla/deduplication/onion/src_sc/hashgen +0 -0
  31. dalla/deduplication/onion/src_sc/hashgen.c +206 -0
  32. dalla/deduplication/onion/src_sc/onion.c +854 -0
  33. dalla/deduplication/onion/src_sc/onion_dup.c +824 -0
  34. dalla/deduplication/onion/src_sc/version.c +17 -0
  35. dalla/deduplication/onion/src_sc/version.h +10 -0
  36. dalla/deduplication/onion_wrapper.py +223 -0
  37. dalla/deduplication/postprocessing.py +216 -0
  38. dalla/deduplication/preprocessing.py +120 -0
  39. dalla/quality/__init__.py +5 -0
  40. dalla/quality/checker.py +354 -0
  41. dalla/readability/__init__.py +197 -0
  42. dalla/readability/ranking.py +165 -0
  43. dalla/readability/scorer.py +148 -0
  44. dalla/stemming/__init__.py +551 -0
  45. dalla/stemming/data/words_al.txt +3414 -0
  46. dalla/stemming/data/words_al_t.txt +885 -0
  47. dalla/stemming/data/words_t.txt +7 -0
  48. dalla/utils/__init__.py +10 -0
  49. dalla/utils/logger.py +128 -0
  50. dalla/utils/tokenize.py +89 -0
  51. dalla_data_processing-0.0.1.dist-info/METADATA +393 -0
  52. dalla_data_processing-0.0.1.dist-info/RECORD +55 -0
  53. dalla_data_processing-0.0.1.dist-info/WHEEL +5 -0
  54. dalla_data_processing-0.0.1.dist-info/entry_points.txt +2 -0
  55. dalla_data_processing-0.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,17 @@
1
+ /*********************************************************************
2
+ * Copyright (c) 2011-2016 Jan Pomikalek, Milos Jakubicek *
3
+ * All rights reserved. *
4
+ * *
5
+ * This software is licensed as described in the file COPYING, which *
6
+ * you should have received as part of this distribution. *
7
+ *********************************************************************/
8
+
9
+ #include <stdio.h>
10
+ #include <stdlib.h>
11
+ #include <string.h>
12
+ #include "version.h"
13
+
14
+ void print_version(const char* progname) {
15
+ printf("%s: onion v%s\n\n", progname, VERSION);
16
+ printf("Copyright (c) 2011-2020 Lexical Computing Limited and Lexical Computing CZ s.r.o.\n");
17
+ }
@@ -0,0 +1,10 @@
1
+ /*********************************************************************
2
+ * Copyright (c) 2011-2016 Jan Pomikalek, Milos Jakubicek *
3
+ * All rights reserved. *
4
+ * *
5
+ * This software is licensed as described in the file COPYING, which *
6
+ * you should have received as part of this distribution. *
7
+ *********************************************************************/
8
+
9
+ #define VERSION "1.4"
10
+ void print_version(const char* progname);
@@ -0,0 +1,223 @@
1
+ """
2
+ Wrapper for the onion C binary deduplication tool.
3
+ Handles execution of the onion binary and management of temporary files.
4
+ """
5
+
6
+ import shutil
7
+ import subprocess
8
+ from pathlib import Path
9
+
10
+ from dalla.utils.logger import get_logger
11
+
12
+ logger = get_logger(__name__)
13
+
14
+
15
def find_onion_binary() -> Path | None:
    """
    Locate an onion executable.

    Candidates are probed in this order and the first regular file wins:
        1. ``onion`` on the system PATH
        2. Bundled platform-specific binary ``bin/onion-<system>-<machine>``
        3. Generic bundled binary ``bin/onion``
        4. Local builds ``onion/src_sc/onion`` and ``onion/src/onion``

    Locations 2-4 are resolved relative to the directory containing this
    module.

    Returns:
        Path to onion binary or None if not found
    """
    import platform

    onion_path = shutil.which("onion")
    if onion_path:
        logger.info(f"Found onion in system PATH: {onion_path}")
        return Path(onion_path)

    package_dir = Path(__file__).parent
    bin_dir = package_dir / "bin"

    # The lower-cased platform.system() value is used directly as the
    # platform component of the bundled file name.
    platform_name = platform.system().lower()
    machine = platform.machine().lower()

    # (candidate path, log label) pairs, in priority order.
    candidates = [
        (
            bin_dir / f"onion-{platform_name}-{machine}",
            "Found bundled platform-specific binary",
        ),
        (bin_dir / "onion", "Found bundled onion binary"),
        (package_dir / "onion" / "src_sc" / "onion", "Found local onion binary"),
        (package_dir / "onion" / "src" / "onion", "Found local onion binary"),
    ]

    for candidate, label in candidates:
        # is_file() is already False for paths that do not exist.
        if candidate.is_file():
            logger.info(f"{label}: {candidate}")
            return candidate

    logger.warning("Onion binary not found in PATH or package")
    return None
73
+
74
+
75
+ def run_onion(
76
+ file_list_path: Path,
77
+ output_dir: Path,
78
+ dataset_name: str = "dataset",
79
+ onion_binary: Path | None = None,
80
+ ) -> tuple[bool, Path | None]:
81
+ """
82
+ Run the onion deduplication binary.
83
+
84
+ Args:
85
+ file_list_path: Path to file containing list of files to process
86
+ output_dir: Directory for output CSV files
87
+ dataset_name: Name of the dataset (for output files)
88
+ onion_binary: Path to onion binary (auto-detected if None)
89
+
90
+ Returns:
91
+ Tuple of (success: bool, output_csv_dir: Path)
92
+ """
93
+ if onion_binary is None:
94
+ onion_binary = find_onion_binary()
95
+
96
+ if onion_binary is None:
97
+ raise FileNotFoundError(
98
+ "Onion binary not found. Please install onion or provide path to binary.\n"
99
+ "Installation: https://corpus.tools/wiki/Onion"
100
+ )
101
+
102
+ onion_binary = Path(onion_binary)
103
+ if not onion_binary.exists():
104
+ raise FileNotFoundError(f"Onion binary not found at: {onion_binary}")
105
+
106
+ output_dir = Path(output_dir)
107
+ output_dir.mkdir(parents=True, exist_ok=True)
108
+
109
+ # onion expects: ./onion -D <dataset-name> -L <file-list> -O <output-dir>
110
+ cmd = [
111
+ str(onion_binary),
112
+ "-D",
113
+ dataset_name,
114
+ "-L",
115
+ str(file_list_path),
116
+ "-O",
117
+ str(output_dir),
118
+ ]
119
+
120
+ logger.info(f"Running onion: {' '.join(cmd)}")
121
+ logger.info(f"Processing file list: {file_list_path}")
122
+ logger.info(f"Output directory: {output_dir}")
123
+
124
+ try:
125
+ # Run onion
126
+ result = subprocess.run(
127
+ cmd,
128
+ capture_output=True,
129
+ text=True,
130
+ check=False,
131
+ timeout=21600, # 6 hour timeout
132
+ )
133
+
134
+ if result.returncode != 0:
135
+ logger.error(f"Onion failed with return code {result.returncode}")
136
+ logger.error(f"STDOUT: {result.stdout}")
137
+ logger.error(f"STDERR: {result.stderr}")
138
+ return False, None
139
+
140
+ logger.info("Onion execution completed successfully")
141
+ if result.stdout:
142
+ logger.debug(f"Onion output: {result.stdout}")
143
+
144
+ return True, output_dir
145
+
146
+ except subprocess.TimeoutExpired:
147
+ logger.error("Onion execution timed out after 1 hour")
148
+ return False, None
149
+ except Exception as e:
150
+ logger.error(f"Error running onion: {e}")
151
+ return False, None
152
+
153
+
154
def compile_onion(source_dir: Path) -> Path | None:
    """
    Build the onion binary with ``make`` inside ``source_dir``.

    ``make clean`` is run first (its outcome is ignored), then
    ``make onion``; ``-f Makefile.g`` is appended when that file exists.

    Args:
        source_dir: Directory containing onion source code (with Makefile)

    Returns:
        Path to ``source_dir / "onion"`` if the build succeeded and the file
        exists, otherwise None (every failure is logged, never raised from
        the build steps).
    """
    source_dir = Path(source_dir)
    if not source_dir.exists():
        logger.error(f"Source directory not found: {source_dir}")
        return None

    alt_makefile = source_dir / "Makefile.g"
    if not alt_makefile.exists() and not (source_dir / "Makefile").exists():
        logger.error(f"No Makefile found in {source_dir}")
        return None

    logger.info(f"Compiling onion from source: {source_dir}")

    try:
        # Best-effort clean: failures here are deliberately not checked.
        subprocess.run(
            ["make", "clean"],
            cwd=source_dir,
            capture_output=True,
            check=False,
        )

        build_cmd = ["make", "onion"]
        if alt_makefile.exists():
            build_cmd += ["-f", "Makefile.g"]

        build = subprocess.run(
            build_cmd,
            cwd=source_dir,
            capture_output=True,
            text=True,
            check=False,
        )
        if build.returncode != 0:
            logger.error(f"Compilation failed: {build.stderr}")
            return None

        built_binary = source_dir / "onion"
        if not built_binary.exists():
            logger.error("Compilation succeeded but binary not found")
            return None

        logger.info(f"Successfully compiled onion: {built_binary}")
        return built_binary

    except Exception as e:
        logger.error(f"Error during compilation: {e}")
        return None
217
+
218
+
219
+ __all__ = [
220
+ "find_onion_binary",
221
+ "run_onion",
222
+ "compile_onion",
223
+ ]
@@ -0,0 +1,216 @@
1
+ """
2
+ Postprocessing utilities for onion output.
3
+
4
+ Parses CSV files from onion and extracts duplicate information.
5
+ """
6
+
7
+ import contextlib
8
+ import csv
9
+ from collections import defaultdict
10
+ from pathlib import Path
11
+
12
+ from tqdm import tqdm
13
+
14
+ from dalla.utils.logger import get_logger
15
+
16
+ logger = get_logger(__name__)
17
+
18
+
19
def parse_onion_csv(csv_file: Path) -> list[tuple[str, int]]:
    """
    Read one onion CSV file into (file_path, line_number) tuples.

    Only the first two columns are used: column one is the file path
    (surrounding whitespace and double quotes are stripped) and column two
    is converted with ``int``. Rows with fewer than two columns are skipped.

    Any exception stops the parse; a warning is logged and the rows
    collected up to that point are returned.

    Args:
        csv_file: Path to CSV file

    Returns:
        List of (file_path, line_number) tuples
    """
    parsed: list[tuple[str, int]] = []

    try:
        with open(csv_file, encoding="utf-8") as handle:
            for record in csv.reader(handle):
                if len(record) < 2:
                    continue
                path_field = record[0].strip().strip('"')
                parsed.append((path_field, int(record[1])))
    except Exception as e:
        logger.warning(f"Error parsing CSV {csv_file}: {e}")

    return parsed
47
+
48
+
49
def extract_duplicates_from_csvs(
    csv_dir: Path,
    file_list_path: Path,
) -> dict[str, set[str]]:
    """
    Build a file -> duplicates mapping from onion CSV outputs.

    Every ``*.csv`` in ``csv_dir`` is treated as one candidate group. The
    second column of each row is a 1-based line number into the file list;
    it is resolved to the path on that line. Out-of-range numbers are
    logged and ignored. Groups that resolve to fewer than two distinct
    files are dropped.

    Args:
        csv_dir: Directory containing onion output CSVs
        file_list_path: Path to original file list (to resolve line numbers)

    Returns:
        Dictionary mapping each file to the set of other files that share a
        group with it
    """
    csv_dir = Path(csv_dir)

    # Blank lines are dropped, so line numbers index the non-empty lines.
    with open(file_list_path, encoding="utf-8") as f:
        file_list = [stripped for stripped in map(str.strip, f) if stripped]
    total_files = len(file_list)

    logger.info(f"Loaded {len(file_list)} files from file list")

    csv_files = list(csv_dir.glob("*.csv"))
    logger.info(f"Found {len(csv_files)} CSV files in {csv_dir}")

    duplicate_groups = []

    for csv_file in tqdm(csv_files, desc="Parsing duplicate CSVs", unit="file"):
        entries = parse_onion_csv(csv_file)

        if not entries:
            logger.debug(f"CSV {csv_file.name} has no entries")
            continue

        logger.debug(f"CSV {csv_file.name}: {len(entries)} entries")

        members = set()
        for _, line_num in entries:
            if not 1 <= line_num <= total_files:
                logger.warning(f"Invalid line number {line_num} in {csv_file.name}")
                continue
            resolved_path = file_list[line_num - 1]
            members.add(resolved_path)
            logger.debug(f" Line {line_num} -> {Path(resolved_path).name}")

        logger.debug(f" Group has {len(members)} unique files")
        if len(members) <= 1:
            logger.debug(f" Skipping group with only {len(members)} file(s)")
            continue

        duplicate_groups.append(members)
        logger.info(f"Found duplicate group with {len(members)} files from {csv_file.name}")

    logger.info(f"Found {len(duplicate_groups)} duplicate groups")

    # A file appearing in several groups accumulates the union of its peers.
    file_to_duplicates = defaultdict(set)
    for members in duplicate_groups:
        for file_path in members:
            file_to_duplicates[file_path].update(members - {file_path})

    logger.info(f"Total files with duplicates: {len(file_to_duplicates)}")

    return dict(file_to_duplicates)
115
+
116
+
117
def create_duplicate_pairs_with_scores(
    csv_dir: Path,
    file_list_path: Path,
) -> list[dict]:
    """
    Turn onion phase 2 CSVs into (doc1, doc2, similarity) records.

    For each ``*.csv`` in ``csv_dir`` the rows are read as
    ``path, line_number[, score]``. A missing or unparsable score defaults
    to 1.0. When the path column is empty, the path is looked up in the
    file list by the 1-based line number. Within one CSV, the first usable
    row is the source document and every later row is paired with it.
    A CSV that raises while being processed is logged and the remaining
    rows of that file are skipped.

    Args:
        csv_dir: Directory containing onion phase 2 CSVs
        file_list_path: Path to file list

    Returns:
        List of duplicate pair dictionaries with format:
        {
            'doc1': file_path_1,
            'doc2': file_path_2,
            'similarity': score taken from the doc2 row
        }
    """
    csv_dir = Path(csv_dir)

    with open(file_list_path, encoding="utf-8") as f:
        file_list = [stripped for stripped in map(str.strip, f) if stripped]

    logger.info(f"Processing phase 2 outputs from {csv_dir}")

    pairs = []
    csv_files = list(csv_dir.glob("*.csv"))

    for csv_file in tqdm(csv_files, desc="Processing similarity scores", unit="file"):
        try:
            # Read the whole file first so a read error adds no rows at all.
            with open(csv_file, encoding="utf-8") as f:
                rows = list(csv.reader(f))

            for row in rows:
                if len(row) < 2:
                    continue

                score = 1.0
                if len(row) > 2:
                    try:
                        score = float(row[2])
                    except (ValueError, IndexError):
                        pass

                line_num = int(row[1]) if row[1].isdigit() else None
                file_path = row[0].strip().strip('"')

                # Fall back to the file list only when the path column is empty.
                if not file_path and line_num and 1 <= line_num <= len(file_list):
                    file_path = file_list[line_num - 1]

                if not file_path:
                    continue

                pairs.append(
                    {
                        "source_csv": csv_file.stem,
                        "file_path": file_path,
                        "similarity": score,
                    }
                )

        except Exception as e:
            logger.warning(f"Error processing {csv_file}: {e}")

    logger.info(f"Extracted {len(pairs)} duplicate entries from CSVs")

    csv_groups = defaultdict(list)
    for entry in pairs:
        csv_groups[entry["source_csv"]].append(entry)

    # Pair the first entry of each CSV with every following entry.
    duplicate_pairs = [
        {
            "doc1": members[0]["file_path"],
            "doc2": other["file_path"],
            "similarity": other["similarity"],
        }
        for members in csv_groups.values()
        if len(members) >= 2
        for other in members[1:]
    ]

    logger.info(f"Created {len(duplicate_pairs)} duplicate pairs")

    return duplicate_pairs
210
+
211
+
212
+ __all__ = [
213
+ "parse_onion_csv",
214
+ "extract_duplicates_from_csvs",
215
+ "create_duplicate_pairs_with_scores",
216
+ ]
@@ -0,0 +1,120 @@
1
+ """
2
+ Preprocessing utilities for deduplication.
3
+
4
+ Handles text normalization and conversion to vertical format for onion.
5
+ """
6
+
7
+ import re
8
+ from pathlib import Path
9
+
10
+ from camel_tools.utils.dediac import dediac_ar
11
+ from tqdm import tqdm
12
+
13
+ from dalla.utils.logger import get_logger
14
+
15
+ logger = get_logger(__name__)
16
+
17
+
18
def text_to_vertical(text: str, doc_id: str = "") -> str:
    """
    Convert text to vertical format (one word per line).

    The text is passed through ``dediac_ar`` and then split on whitespace;
    the resulting words are joined with newlines.

    Args:
        text: Input text; any non-string value (including None) is treated
            as an empty string
        doc_id: Document identifier (optional)

    Returns:
        The vertical text, wrapped in ``<doc name='...'>`` / ``</doc>``
        lines when ``doc_id`` is non-empty
    """
    source = text if isinstance(text, str) else ""

    tokens = dediac_ar(source).split()
    body = "\n".join(tokens)

    if not doc_id:
        return body
    return f"<doc name='{doc_id}'>\n{body}\n</doc>"
45
+
46
+
47
def create_vert_files(
    texts: list[str],
    ids: list[str],
    output_dir: Path,
) -> tuple[list[Path], dict[str, str]]:
    """
    Write one vertical-format file per (text, id) pair.

    Files are named ``<8-digit index>_<sanitised id>.txt``. In the id, every
    character other than word characters, ``-``, ``_`` and ``.`` becomes
    ``_``, and only the last 200 characters are kept. ``texts`` and ``ids``
    are zipped non-strictly, so surplus items in the longer list are
    ignored. Repeated ids overwrite each other in the returned mapping.

    Args:
        texts: List of text strings
        ids: List of document IDs
        output_dir: Directory to save vert files (created if missing)

    Returns:
        Tuple of (list of file paths, mapping of original IDs to vert file paths)
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    logger.info(f"Creating {len(texts)} vert files in {output_dir}")

    unsafe_chars = re.compile(r"[^\w\-_.]")
    file_paths = []
    id_mapping = {}

    progress = tqdm(zip(texts, ids, strict=False), total=len(texts), desc="Creating vert files")
    for idx, (text, doc_id) in enumerate(progress):
        # Keep the tail of over-long ids.
        safe_filename = unsafe_chars.sub("_", str(doc_id))[-200:]

        filepath = output_dir / f"{idx:08d}_{safe_filename}.txt"
        filepath.write_text(text_to_vertical(text, doc_id), encoding="utf-8")

        file_paths.append(filepath)
        id_mapping[str(doc_id)] = str(filepath)

    logger.info(f"Created {len(file_paths)} vert files")

    return file_paths, id_mapping
91
+
92
+
93
def create_file_list(file_paths: list[Path], output_file: Path) -> Path:
    """
    Write the given paths to a text file, one per line (input for onion).

    The parent directory of ``output_file`` is created when missing and an
    existing file is overwritten.

    Args:
        file_paths: List of file paths
        output_file: Path to output file list

    Returns:
        Path to created file list
    """
    output_file = Path(output_file)
    output_file.parent.mkdir(parents=True, exist_ok=True)

    with open(output_file, "w", encoding="utf-8") as listing:
        listing.writelines(f"{filepath}\n" for filepath in file_paths)

    logger.info(f"Created file list with {len(file_paths)} entries: {output_file}")

    return output_file
114
+
115
+
116
+ __all__ = [
117
+ "text_to_vertical",
118
+ "create_vert_files",
119
+ "create_file_list",
120
+ ]
@@ -0,0 +1,5 @@
1
+ """Quality checking module for text quality assessment."""
2
+
3
+ from dalla.quality.checker import QualityChecker, check_quality
4
+
5
+ __all__ = ["check_quality", "QualityChecker"]