dalla-data-processing 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. dalla/__init__.py +27 -0
  2. dalla/cli.py +453 -0
  3. dalla/core/__init__.py +6 -0
  4. dalla/core/dataset.py +387 -0
  5. dalla/core/parallel.py +279 -0
  6. dalla/deduplication/__init__.py +370 -0
  7. dalla/deduplication/bin/.gitignore +1 -0
  8. dalla/deduplication/bin/onion-linux-x86_64 +0 -0
  9. dalla/deduplication/onion/COPYING +24 -0
  10. dalla/deduplication/onion/Makefile +21 -0
  11. dalla/deduplication/onion/Makefile.config +3 -0
  12. dalla/deduplication/onion/README.md +21 -0
  13. dalla/deduplication/onion/src/Makefile +22 -0
  14. dalla/deduplication/onion/src/Makefile.g +23 -0
  15. dalla/deduplication/onion/src/buzhash.c +325 -0
  16. dalla/deduplication/onion/src/buzhash.h +30 -0
  17. dalla/deduplication/onion/src/hashdup.c +172 -0
  18. dalla/deduplication/onion/src/hashgen.c +206 -0
  19. dalla/deduplication/onion/src/onion +0 -0
  20. dalla/deduplication/onion/src/onion.c +799 -0
  21. dalla/deduplication/onion/src/onion_dup.c +824 -0
  22. dalla/deduplication/onion/src/version.c +17 -0
  23. dalla/deduplication/onion/src/version.h +10 -0
  24. dalla/deduplication/onion/src_sc/Makefile +22 -0
  25. dalla/deduplication/onion/src_sc/Makefile.g +23 -0
  26. dalla/deduplication/onion/src_sc/buzhash.c +325 -0
  27. dalla/deduplication/onion/src_sc/buzhash.h +30 -0
  28. dalla/deduplication/onion/src_sc/hashdup +0 -0
  29. dalla/deduplication/onion/src_sc/hashdup.c +172 -0
  30. dalla/deduplication/onion/src_sc/hashgen +0 -0
  31. dalla/deduplication/onion/src_sc/hashgen.c +206 -0
  32. dalla/deduplication/onion/src_sc/onion.c +854 -0
  33. dalla/deduplication/onion/src_sc/onion_dup.c +824 -0
  34. dalla/deduplication/onion/src_sc/version.c +17 -0
  35. dalla/deduplication/onion/src_sc/version.h +10 -0
  36. dalla/deduplication/onion_wrapper.py +223 -0
  37. dalla/deduplication/postprocessing.py +216 -0
  38. dalla/deduplication/preprocessing.py +120 -0
  39. dalla/quality/__init__.py +5 -0
  40. dalla/quality/checker.py +354 -0
  41. dalla/readability/__init__.py +197 -0
  42. dalla/readability/ranking.py +165 -0
  43. dalla/readability/scorer.py +148 -0
  44. dalla/stemming/__init__.py +551 -0
  45. dalla/stemming/data/words_al.txt +3414 -0
  46. dalla/stemming/data/words_al_t.txt +885 -0
  47. dalla/stemming/data/words_t.txt +7 -0
  48. dalla/utils/__init__.py +10 -0
  49. dalla/utils/logger.py +128 -0
  50. dalla/utils/tokenize.py +89 -0
  51. dalla_data_processing-0.0.1.dist-info/METADATA +393 -0
  52. dalla_data_processing-0.0.1.dist-info/RECORD +55 -0
  53. dalla_data_processing-0.0.1.dist-info/WHEEL +5 -0
  54. dalla_data_processing-0.0.1.dist-info/entry_points.txt +2 -0
  55. dalla_data_processing-0.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,370 @@
1
+ """
2
+ Deduplication module using onion algorithm.
3
+
4
+ This module provides a streamlined interface to the onion deduplication tool,
5
+ wrapping the complex multi-step process into a single API call.
6
+ """
7
+
8
+ import gc
9
+ import shutil
10
+ import tempfile
11
+ from pathlib import Path
12
+
13
+ from datasets import Dataset
14
+ from tqdm import tqdm
15
+
16
+ from dalla.deduplication.onion_wrapper import find_onion_binary, run_onion
17
+ from dalla.deduplication.postprocessing import extract_duplicates_from_csvs
18
+ from dalla.deduplication.preprocessing import create_file_list, create_vert_files
19
+ from dalla.utils.logger import get_logger
20
+
21
+ logger = get_logger(__name__)
22
+
23
+
24
def deduplicate_dataset(
    dataset: Dataset,
    column: str = "text",
    threshold: float = 0.8,
    return_pairs: bool = True,
    keep_vert_files: bool = False,
    vert_dir: Path | None = None,
    onion_binary: Path | None = None,
    work_dir: Path | None = None,
    calculate_scores: bool = False,
) -> Dataset:
    """
    Remove duplicate entries from dataset using onion algorithm.

    This function:
    1. Converts texts to vertical format (one word per line)
    2. Runs onion phase 1 (find duplicate groups)
    3. Optionally runs onion phase 2 (calculate similarity scores)
    4. Adds duplicate information to dataset

    Args:
        dataset: HuggingFace dataset
        column: Column to check for duplicates (default: "text")
        threshold: Similarity threshold 0.0-1.0 (default: 0.8)
            NOTE(review): this function only logs the value; it is not
            forwarded to ``run_onion`` here. Confirm whether onion picks it
            up some other way.
        return_pairs: If True, return dataset with duplicate pairs info
            If False, return deduplicated dataset
        keep_vert_files: Keep vertical format files for inspection (default: False)
        vert_dir: Directory to store vertical files (default: work_dir/vert_files)
            Useful if you want to store vert files on a different disk
        onion_binary: Path to onion binary (auto-detected if None)
        work_dir: Working directory for temp files (auto-created if None)
        calculate_scores: Run phase 2 to calculate similarity scores (default: False)
            Set to True if you need precise similarity measurements
            Phase 1 is usually sufficient for deduplication

    Returns:
        - If return_pairs=True, the dataset with added columns:
            - duplicate_cluster: Cluster ID for duplicate groups (-1 if none)
            - is_duplicate: Boolean indicating if doc has duplicates
            - duplicate_count: Number of duplicates found
        - If return_pairs=False, the dataset with duplicates removed and no
          extra columns (the input dataset itself when nothing was flagged).

    Example:
        >>> # Get duplicate information
        >>> result = deduplicate_dataset(dataset, return_pairs=True)
        >>> duplicates = result.filter(lambda x: x['is_duplicate'])
        >>>
        >>> # Get deduplicated dataset
        >>> deduped = deduplicate_dataset(dataset, return_pairs=False)
        >>>
        >>> # Store vert files on different disk
        >>> deduped = deduplicate_dataset(
        ...     dataset,
        ...     vert_dir=Path("/mnt/large_disk/vert_files"),
        ...     keep_vert_files=True
        ... )

    Raises:
        FileNotFoundError: If onion binary not found
        RuntimeError: If onion phase 1 fails
    """
    logger.info(f"Starting deduplication of {len(dataset)} examples")
    logger.info(f"Column: {column}, Threshold: {threshold}, Return pairs: {return_pairs}")

    # Check for onion binary
    if onion_binary is None:
        onion_binary = find_onion_binary()
        if onion_binary is None:
            raise FileNotFoundError(
                "Onion binary not found. Please install onion or set ONION_BINARY environment variable.\n"
                "Installation: https://corpus.tools/wiki/Onion"
            )

    logger.info(f"Using onion binary: {onion_binary}")

    # Create working directory; only a directory we created ourselves is
    # ever deleted in the cleanup step below.
    if work_dir is None:
        work_dir = Path(tempfile.mkdtemp(prefix="dalla_dedup_"))
        cleanup_work_dir = True
    else:
        work_dir = Path(work_dir)
        work_dir.mkdir(parents=True, exist_ok=True)
        cleanup_work_dir = False

    logger.info(f"Working directory: {work_dir}")

    try:
        # Step 1: Extract texts and IDs
        logger.info("Step 1: Extracting texts from dataset...")
        texts = []
        ids = []

        for idx, example in enumerate(tqdm(dataset, desc="Extracting texts")):
            texts.append(example.get(column, ""))
            ids.append(f"doc_{idx}")

        logger.info(f"Extracted {len(texts)} texts")

        # Check text lengths (onion needs sufficient content).
        # Empty/None texts are skipped for min/max but still count towards
        # the average's denominator.
        token_counts = [len(t.split()) for t in texts if t]
        avg_tokens = sum(token_counts) / len(texts) if texts else 0
        min_tokens = min(token_counts, default=0)
        max_tokens = max(token_counts, default=0)
        del token_counts

        logger.info(
            f"Text statistics: min={min_tokens}, avg={avg_tokens:.1f}, max={max_tokens} tokens"
        )

        if avg_tokens < 10:
            logger.warning(
                f"Texts are very short (avg {avg_tokens:.1f} tokens). "
                "Onion requires at least ~10-15 tokens per text for reliable duplicate detection. "
                "Short texts may be marked as 'bad' and not processed."
            )

        # Step 2: Create vertical format files
        logger.info("Step 2: Creating vertical format files...")
        # Use custom vert_dir if provided, otherwise use work_dir/vert_files
        vert_dir = work_dir / "vert_files" if vert_dir is None else Path(vert_dir)

        logger.info(f"Vertical files directory: {vert_dir}")
        # The second return value (an ID mapping) is not needed here.
        file_paths, _ = create_vert_files(texts, ids, vert_dir)

        # Free memory: texts list is no longer needed after creating vert files
        del texts
        gc.collect()

        # Step 3: Create file list for onion
        logger.info("Step 3: Creating file list...")
        file_list_path = work_dir / "file_list.txt"
        create_file_list(file_paths, file_list_path)

        # Step 4: Run onion phase 1 (find duplicates)
        logger.info("Step 4: Running onion phase 1 (finding duplicates)...")
        phase1_output = work_dir / "phase1_output"
        success, csv_dir = run_onion(
            file_list_path,
            phase1_output,
            dataset_name="phase1",
            onion_binary=onion_binary,
        )

        if not success or csv_dir is None:
            raise RuntimeError("Onion phase 1 failed")

        # Step 5: Extract duplicate file paths
        logger.info("Step 5: Extracting duplicate paths...")
        file_to_duplicates = extract_duplicates_from_csvs(csv_dir, file_list_path)

        if not file_to_duplicates:
            logger.info("No duplicates found!")
            if avg_tokens < 10:
                logger.warning(
                    "No duplicates detected. This might be because texts are too short for onion. "
                    "Onion requires texts with at least 10-15 tokens for reliable duplicate detection. "
                    "Consider using a different deduplication method for very short texts."
                )
            if return_pairs:
                # Add empty duplicate columns
                return _add_empty_duplicate_columns(dataset)
            # Filtering mode never adds columns, so with nothing to remove
            # the input is already the deduplicated result.
            return dataset

        # Step 6: Create file list of only duplicates for phase 2
        logger.info(f"Step 6: Found {len(file_to_duplicates)} files with duplicates")
        duplicate_files = set(file_to_duplicates).union(*file_to_duplicates.values())

        duplicate_file_list = work_dir / "duplicate_files.txt"
        create_file_list([Path(f) for f in duplicate_files], duplicate_file_list)

        # Free memory: duplicate_files is no longer needed once written out
        del duplicate_files

        # Step 7: Optionally run onion phase 2 (calculate scores)
        csv_dir2 = None
        if calculate_scores:
            logger.info("Step 7: Running onion phase 2 (calculating similarity scores)...")
            phase2_output = work_dir / "phase2_output"
            success, csv_dir2 = run_onion(
                duplicate_file_list,
                phase2_output,
                dataset_name="phase2",
                onion_binary=onion_binary,
            )

            if not success:
                # Phase 2 is best-effort: fall back to phase 1 results.
                logger.warning("Onion phase 2 failed, using phase 1 results only")
                csv_dir2 = None
        else:
            logger.info("Step 7: Skipping phase 2 (calculate_scores=False)")

        # Step 8: Parse results and add to dataset
        logger.info("Step 8: Processing results...")

        if return_pairs:
            # Add duplicate information columns
            result = _add_duplicate_columns(
                dataset,
                file_paths,
                ids,
                file_to_duplicates,
                csv_dir2,
                duplicate_file_list,
            )
        else:
            # Filter out duplicates
            result = _filter_duplicates(
                dataset,
                file_paths,
                ids,
                file_to_duplicates,
            )

        logger.info("Deduplication complete!")
        return result

    finally:
        # Cleanup
        # NOTE(review): a caller-supplied work_dir or vert_dir is never
        # removed, even when keep_vert_files is False — confirm that leaving
        # those files behind is intended.
        if cleanup_work_dir and not keep_vert_files:
            logger.info(f"Cleaning up working directory: {work_dir}")
            shutil.rmtree(work_dir, ignore_errors=True)
        elif keep_vert_files:
            # vert_dir is still None if we failed before Step 2 without a
            # custom directory; report the working directory in that case.
            kept_in = vert_dir if vert_dir is not None else work_dir
            logger.info(f"Vertical files kept in: {kept_in}")
262
+
263
+
264
def _add_empty_duplicate_columns(dataset: Dataset) -> Dataset:
    """Return ``dataset`` with the duplicate columns set to "no duplicate" values.

    Every row gets ``duplicate_cluster=-1``, ``is_duplicate=False`` and
    ``duplicate_count=0``.
    """
    # Column order matters for the resulting schema, so use an ordered tuple.
    no_duplicate_defaults = (
        ("duplicate_cluster", -1),
        ("is_duplicate", False),
        ("duplicate_count", 0),
    )

    def mark_unique(row):
        for column_name, default_value in no_duplicate_defaults:
            row[column_name] = default_value
        return row

    return dataset.map(mark_unique, desc="Adding duplicate columns")
274
+
275
+
276
def _add_duplicate_columns(
    dataset: Dataset,
    file_paths: list[Path],
    ids: list[str],
    file_to_duplicates: dict[str, set],
    csv_dir: Path | None,
    duplicate_file_list: Path,
) -> Dataset:
    """Add duplicate information columns to dataset.

    Documents linked through ``file_to_duplicates`` (directly or through a
    chain of other documents) are placed in the same cluster.

    Args:
        dataset: Dataset to annotate; row ``i`` corresponds to ``ids[i]``.
        file_paths: Vert-file path of each document, parallel to ``ids``.
        ids: Document IDs, parallel to ``file_paths``.
        file_to_duplicates: Maps a file path to the set of file paths
            reported as its duplicates.
        csv_dir: Accepted for interface compatibility; not read here.
        duplicate_file_list: Accepted for interface compatibility; not read here.

    Returns:
        The dataset with three added columns: ``duplicate_cluster`` (cluster
        ID, or -1), ``is_duplicate`` (bool) and ``duplicate_count`` (number
        of other documents in the same cluster).
    """
    # Index document positions by path once instead of rescanning
    # file_paths for every cluster member.
    path_to_indices: dict[str, list[int]] = {}
    for idx, fp in enumerate(file_paths):
        path_to_indices.setdefault(str(fp), []).append(idx)

    # Treat each "file -> duplicate" entry as an undirected edge so clusters
    # are the connected components of the duplicate graph.
    neighbours: dict[str, set[str]] = {}
    for file_path, duplicates in file_to_duplicates.items():
        source = str(file_path)
        links = neighbours.setdefault(source, set())
        for dup in duplicates:
            target = str(dup)
            links.add(target)
            neighbours.setdefault(target, set()).add(source)

    # Build mapping from doc ID to cluster ID (one ID per connected component)
    clusters: dict[str, int] = {}
    visited: set[str] = set()
    cluster_id = 0
    for start in neighbours:
        if start in visited:
            continue

        visited.add(start)
        pending = [start]
        while pending:
            node = pending.pop()
            for idx in path_to_indices.get(node, ()):
                clusters[ids[idx]] = cluster_id
            for linked in neighbours[node]:
                if linked not in visited:
                    visited.add(linked)
                    pending.append(linked)

        cluster_id += 1

    logger.info(f"Created {cluster_id} duplicate clusters")

    # Count members per cluster once so the per-row callback is O(1).
    cluster_sizes: dict[int, int] = {}
    for cid in clusters.values():
        cluster_sizes[cid] = cluster_sizes.get(cid, 0) + 1

    # Add columns to dataset
    def add_duplicate_info(example, idx):
        cid = clusters.get(ids[idx])

        if cid is not None:
            example["duplicate_cluster"] = cid
            example["is_duplicate"] = True
            example["duplicate_count"] = cluster_sizes[cid] - 1  # Exclude self
        else:
            example["duplicate_cluster"] = -1
            example["is_duplicate"] = False
            example["duplicate_count"] = 0

        return example

    result = dataset.map(add_duplicate_info, with_indices=True, desc="Adding duplicate info")

    # Log statistics (guard the percentage against an empty dataset)
    num_duplicates = len(clusters)
    total = len(dataset)
    duplicate_pct = num_duplicates / total * 100 if total else 0.0
    logger.info(f"Found {num_duplicates} documents with duplicates ({duplicate_pct:.1f}%)")

    return result
342
+
343
+
344
def _filter_duplicates(
    dataset: Dataset,
    file_paths: list[Path],
    ids: list[str],
    file_to_duplicates: dict[str, set],
) -> Dataset:
    """Filter out every document listed as a duplicate of another file.

    Args:
        dataset: Dataset to filter; row ``i`` corresponds to ``file_paths[i]``.
        file_paths: Vert-file path of each document, in dataset order.
        ids: Document IDs; accepted for interface compatibility, not read here.
        file_to_duplicates: Maps a file path to the set of file paths
            reported as its duplicates. Every path appearing in a value set
            is dropped; keys are kept unless they also appear as a value.
            NOTE(review): this keeps "the first occurrence" only if the
            mapping never lists the retained document as a duplicate of
            another one — confirm against extract_duplicates_from_csvs.

    Returns:
        ``dataset.select(...)`` over the rows that were not flagged.
    """
    # Index document positions by path once instead of rescanning
    # file_paths for every duplicate.
    path_to_indices: dict[str, list[int]] = {}
    for idx, fp in enumerate(file_paths):
        path_to_indices.setdefault(str(fp), []).append(idx)

    # Build set of indices to drop
    to_remove: set[int] = set()
    for duplicates in file_to_duplicates.values():
        for dup in duplicates:
            to_remove.update(path_to_indices.get(str(dup), ()))

    total = len(dataset)
    indices_to_keep = [idx for idx in range(total) if idx not in to_remove]
    # Guard the percentage against an empty dataset.
    removed_pct = len(to_remove) / total * 100 if total else 0.0

    logger.info(f"Keeping {len(indices_to_keep)} / {total} documents")
    logger.info(f"Removed {len(to_remove)} duplicates ({removed_pct:.1f}%)")

    return dataset.select(indices_to_keep)
368
+
369
+
370
+ __all__ = ["deduplicate_dataset"]
@@ -0,0 +1 @@
1
+ onion-*
@@ -0,0 +1,24 @@
1
+ Copyright (c) 2011-2021 Lexical Computing Limited and Lexical Computing CZ s.r.o.
2
+ ALL RIGHTS RESERVED.
3
+ Moral rights holders: Milos Jakubicek, Jan Pomikalek, Vit Suchomel
4
+ as indicated in individual source files
5
+
6
+ Redistribution and use in source and binary forms, with or without modification,
7
+ are permitted provided that the following conditions are met:
8
+
9
+ * Redistributions of source code must retain the above copyright notice, this
10
+ list of conditions and the following disclaimer.
11
+ * Redistributions in binary form must reproduce the above copyright notice,
12
+ this list of conditions and the following disclaimer in the documentation
13
+ and/or other materials provided with the distribution.
14
+
15
+ THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
16
+ EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18
+ DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
19
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
22
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,21 @@
1
# Top-level build driver for onion: delegates building to src/ and
# assembles a release tarball under packages/.
include Makefile.config

# Version string is taken from the VERSION define in src/version.h.
VERSION=$(shell grep define\ VERSION\ src/version.h | cut -f2 -d\")
PACKAGE_NAME="onion-$(VERSION)"

# None of these targets produce a file of the same name.
.PHONY: all install clean dist

# $(MAKE) (rather than a literal `make`) passes flags such as -j and -n
# through to the sub-make.
all:
	cd src && $(MAKE)

install:
	cd src && $(MAKE) install
	mkdir -p $(INSTALL_DATA) && cp -r doc/ $(INSTALL_DATA)

clean:
	cd src && $(MAKE) clean

# NOTE(review): this recipe copies README and doc, but this vendored tree
# ships README.md and no doc/ directory — confirm before relying on dist
# (install's doc/ copy is affected the same way).
dist:
	rm -rf packages/$(PACKAGE_NAME)/ packages/$(PACKAGE_NAME).tar.gz
	mkdir -p packages/$(PACKAGE_NAME)/src/
	cp src/*.c src/*.h src/Makefile packages/$(PACKAGE_NAME)/src/
	cp -r Makefile Makefile.config COPYING README doc packages/$(PACKAGE_NAME)/
	cd packages && tar -czvf $(PACKAGE_NAME).tar.gz $(PACKAGE_NAME)/
@@ -0,0 +1,3 @@
1
# Installation locations shared by the onion Makefiles.
# Override PREFIX to relocate both directories at once.
PREFIX = /usr/local
# Destination for the built executables.
INSTALL_BIN = $(PREFIX)/bin
# Destination for the documentation copied by the top-level install target.
INSTALL_DATA = $(PREFIX)/share
@@ -0,0 +1,21 @@
1
+ onion -- One Instance Only
2
+ ==========================
3
+
4
+ Prerequisites:
5
+ 64-bit CPU architecture
6
+ Google sparse hashset (https://github.com/sparsehash/sparsehash)
7
+
8
+ To install the package:
9
+ 1. edit Makefile.config:
10
+ - set PREFIX (or INSTALL_BIN and INSTALL_DATA) according to where you
11
+ want the executables and data installed
12
+ 2. type:
13
+ make
14
+ make install
15
+
16
+ For usage information see:
17
+ onion -h
18
+ man onion
19
+
20
+ Project home page:
21
+ <http://corpus.tools/wiki/Onion>
@@ -0,0 +1,22 @@
1
# Optimised build of the onion tools (hashgen, hashdup, onion).
include ../Makefile.config

CC=g++
CFLAGS=-Wall -O3

OBJS=version.o buzhash.o
TARGETS=hashgen hashdup onion

# These targets are commands, not files; without .PHONY a stray file named
# `all`, `install` or `clean` would silently disable them.
.PHONY: all install clean

all: $(TARGETS)

# Every tool links against the shared objects.
$(TARGETS): $(OBJS)

%.o: %.c %.h
	$(CC) $(CFLAGS) -c $<

# $^ expands to the tool's own .c file plus $(OBJS) from the rule above.
# LIBS is not set in this file or in Makefile.config; it can be supplied on
# the command line.
%: %.c
	$(CC) $(CFLAGS) $^ $(LIBS) -o $@

install: $(TARGETS)
	mkdir -p $(INSTALL_BIN) && install -m 755 $(TARGETS) $(INSTALL_BIN)

clean:
	rm -f $(OBJS) $(TARGETS)
@@ -0,0 +1,23 @@
1
# Debug build of the onion tools (hashgen, hashdup, onion): compiles with -g
# instead of -O3.
include ../Makefile.config

CC=g++
#CFLAGS=-Wall -O3
CFLAGS=-Wall -g

OBJS=version.o buzhash.o
TARGETS=hashgen hashdup onion

# These targets are commands, not files; without .PHONY a stray file named
# `all`, `install` or `clean` would silently disable them.
.PHONY: all install clean

all: $(TARGETS)

# Every tool links against the shared objects.
$(TARGETS): $(OBJS)

%.o: %.c %.h
	$(CC) $(CFLAGS) -c $<

# $^ expands to the tool's own .c file plus $(OBJS) from the rule above.
# LIBS is not set in this file or in Makefile.config; it can be supplied on
# the command line.
%: %.c
	$(CC) $(CFLAGS) $^ $(LIBS) -o $@

install: $(TARGETS)
	mkdir -p $(INSTALL_BIN) && install -m 755 $(TARGETS) $(INSTALL_BIN)

clean:
	rm -f $(OBJS) $(TARGETS)