dalla-data-processing 0.0.11__tar.gz → 0.0.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/.pre-commit-config.yaml +0 -2
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/PKG-INFO +5 -5
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/README.md +4 -4
- dalla_data_processing-0.0.12/dalla_data_processing/_version.py +24 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/readability/__init__.py +17 -5
- dalla_data_processing-0.0.12/dalla_data_processing/readability/arabic_flesch.py +147 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/readability/ranking.py +51 -41
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/readability/scorer.py +10 -13
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/utils/tokenize.py +2 -3
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing.egg-info/PKG-INFO +5 -5
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing.egg-info/SOURCES.txt +1 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/scripts/build_onion.sh +4 -3
- dalla_data_processing-0.0.11/dalla_data_processing/_version.py +0 -34
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/.dockerignore +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/.github/workflows/ci.yml +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/.github/workflows/release.yml +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/.gitignore +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/LICENSE +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/MANIFEST.in +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/__init__.py +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/cli.py +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/core/README.md +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/core/__init__.py +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/core/dataset.py +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/core/parallel.py +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/README.md +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/__init__.py +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/bin/.gitignore +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/COPYING +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/Makefile +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/Makefile.config +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/README.md +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src/Makefile +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src/Makefile.g +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src/buzhash.c +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src/buzhash.h +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src/hashdup.c +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src/hashgen.c +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src/onion +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src/onion.c +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src/onion_dup.c +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src/version.c +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src/version.h +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src_sc/.gitignore +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src_sc/Makefile +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src_sc/Makefile.g +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src_sc/buzhash.c +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src_sc/buzhash.h +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src_sc/hashdup +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src_sc/hashdup.c +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src_sc/hashgen +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src_sc/hashgen.c +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src_sc/onion.c +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src_sc/onion_dup.c +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src_sc/version.c +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src_sc/version.h +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion_wrapper.py +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/postprocessing.py +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/preprocessing.py +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/packing/README.md +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/packing/__init__.py +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/packing/dataset_packer.py +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/packing/pack_config.example.yaml +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/quality/README.md +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/quality/__init__.py +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/quality/checker.py +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/readability/README.md +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/stemming/README.md +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/stemming/__init__.py +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/stemming/data/words_al.txt +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/stemming/data/words_al_t.txt +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/stemming/data/words_t.txt +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/stemming/stemmer.py +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/utils/__init__.py +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/utils/logger.py +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing.egg-info/dependency_links.txt +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing.egg-info/entry_points.txt +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing.egg-info/not-zip-safe +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing.egg-info/requires.txt +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing.egg-info/top_level.txt +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/pyproject.toml +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/scripts/release.sh +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/setup.cfg +0 -0
- {dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/uv.lock +0 -0
|
@@ -3,7 +3,6 @@
|
|
|
3
3
|
# Run manually: pre-commit run --all-files
|
|
4
4
|
|
|
5
5
|
repos:
|
|
6
|
-
# Ruff - Fast Python linter and formatter
|
|
7
6
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
8
7
|
rev: v0.1.15
|
|
9
8
|
hooks:
|
|
@@ -11,6 +10,5 @@ repos:
|
|
|
11
10
|
args: [--fix, --exit-non-zero-on-fix]
|
|
12
11
|
- id: ruff-format
|
|
13
12
|
|
|
14
|
-
# Configuration
|
|
15
13
|
default_language_version:
|
|
16
14
|
python: python3.12
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dalla-data-processing
|
|
3
|
-
Version: 0.0.11
|
|
3
|
+
Version: 0.0.12
|
|
4
4
|
Summary: data processing pipeline with deduplication, stemming, quality checking, and readability scoring, used for the DALLA Models
|
|
5
5
|
Author-email: Hadi Hamoud <hhamoud@dohainstitute.edu.qa>, Digital Research Unit - Arab Center <dru@dohainstitute.edu.qa>
|
|
6
6
|
License: CC-BY-NC-SA-4.0
|
|
@@ -57,7 +57,7 @@ A comprehensive Arabic data processing pipeline with deduplication, stemming, qu
|
|
|
57
57
|
|
|
58
58
|
- **Linux**: Fully supported
|
|
59
59
|
- **macOS**: Fully supported (Intel or through rosetta)
|
|
60
|
-
- **Windows**: Supported through WSL
|
|
60
|
+
- **Windows**: Supported through WSL. On native Windows, deduplication works when built manually from source.
|
|
61
61
|
|
|
62
62
|
## Installation
|
|
63
63
|
|
|
@@ -98,7 +98,7 @@ pip install "dalla-data-processing[dedup,stem,quality]"
|
|
|
98
98
|
|
|
99
99
|
### Development Installation
|
|
100
100
|
|
|
101
|
-
<b>From Source (with uv
|
|
101
|
+
<b>From Source (with uv)</b>
|
|
102
102
|
|
|
103
103
|
```bash
|
|
104
104
|
git clone https://github.com/U4RASD/dalla-data-processing.git
|
|
@@ -148,6 +148,6 @@ Pack and prepare datasets for training.
|
|
|
148
148
|
|
|
149
149
|
## Links
|
|
150
150
|
|
|
151
|
-
- Homepage: https://
|
|
152
|
-
- Issues: https://github.com/U4RASD/dalla-data-processing/issues
|
|
151
|
+
- Homepage: https://acrps.ai
|
|
153
152
|
- Documentation: https://github.com/U4RASD/dalla-data-processing#readme
|
|
153
|
+
- ACRPS: https://acr.ps
|
|
@@ -6,7 +6,7 @@ A comprehensive Arabic data processing pipeline with deduplication, stemming, qu
|
|
|
6
6
|
|
|
7
7
|
- **Linux**: Fully supported
|
|
8
8
|
- **macOS**: Fully supported (Intel or through rosetta)
|
|
9
|
-
- **Windows**: Supported through WSL
|
|
9
|
+
- **Windows**: Supported through WSL. On native Windows, deduplication works when built manually from source.
|
|
10
10
|
|
|
11
11
|
## Installation
|
|
12
12
|
|
|
@@ -47,7 +47,7 @@ pip install "dalla-data-processing[dedup,stem,quality]"
|
|
|
47
47
|
|
|
48
48
|
### Development Installation
|
|
49
49
|
|
|
50
|
-
<b>From Source (with uv
|
|
50
|
+
<b>From Source (with uv)</b>
|
|
51
51
|
|
|
52
52
|
```bash
|
|
53
53
|
git clone https://github.com/U4RASD/dalla-data-processing.git
|
|
@@ -97,6 +97,6 @@ Pack and prepare datasets for training.
|
|
|
97
97
|
|
|
98
98
|
## Links
|
|
99
99
|
|
|
100
|
-
- Homepage: https://
|
|
101
|
-
- Issues: https://github.com/U4RASD/dalla-data-processing/issues
|
|
100
|
+
- Homepage: https://acrps.ai
|
|
102
101
|
- Documentation: https://github.com/U4RASD/dalla-data-processing#readme
|
|
102
|
+
- ACRPS: https://acr.ps
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# file generated by vcs-versioning
|
|
2
|
+
# don't change, don't track in version control
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
__all__ = [
|
|
6
|
+
"__version__",
|
|
7
|
+
"__version_tuple__",
|
|
8
|
+
"version",
|
|
9
|
+
"version_tuple",
|
|
10
|
+
"__commit_id__",
|
|
11
|
+
"commit_id",
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
version: str
|
|
15
|
+
__version__: str
|
|
16
|
+
__version_tuple__: tuple[int | str, ...]
|
|
17
|
+
version_tuple: tuple[int | str, ...]
|
|
18
|
+
commit_id: str | None
|
|
19
|
+
__commit_id__: str | None
|
|
20
|
+
|
|
21
|
+
__version__ = version = '0.0.12'
|
|
22
|
+
__version_tuple__ = version_tuple = (0, 0, 12)
|
|
23
|
+
|
|
24
|
+
__commit_id__ = commit_id = 'g3a0e013e2'
|
|
@@ -2,7 +2,11 @@
|
|
|
2
2
|
|
|
3
3
|
from datasets import Dataset
|
|
4
4
|
|
|
5
|
-
from dalla_data_processing.readability.ranking import
|
|
5
|
+
from dalla_data_processing.readability.ranking import (
|
|
6
|
+
OSMAN_WEIGHT,
|
|
7
|
+
WEIGHTED,
|
|
8
|
+
compute_ranks_and_levels,
|
|
9
|
+
)
|
|
6
10
|
from dalla_data_processing.readability.scorer import ReadabilityScorer
|
|
7
11
|
from dalla_data_processing.utils.logger import get_logger
|
|
8
12
|
|
|
@@ -14,6 +18,8 @@ def score_readability(
|
|
|
14
18
|
column: str = "text",
|
|
15
19
|
add_ranks: bool = True,
|
|
16
20
|
num_proc: int | None = None,
|
|
21
|
+
level_method: str = WEIGHTED,
|
|
22
|
+
osman_weight: float = OSMAN_WEIGHT,
|
|
17
23
|
) -> Dataset:
|
|
18
24
|
"""
|
|
19
25
|
Score readability using Flesch and Osman methods, with optional ranking.
|
|
@@ -32,6 +38,8 @@ def score_readability(
|
|
|
32
38
|
column: Column to score
|
|
33
39
|
add_ranks: Whether to add ranking columns (default: True)
|
|
34
40
|
num_proc: Number of parallel processes
|
|
41
|
+
level_method: Bin-combination strategy, "weighted" or "conservative"
|
|
42
|
+
osman_weight: Weight on the Osman bin when level_method="weighted"
|
|
35
43
|
|
|
36
44
|
Returns:
|
|
37
45
|
Dataset with readability scores and optional rankings
|
|
@@ -103,15 +111,17 @@ def score_readability(
|
|
|
103
111
|
|
|
104
112
|
# Step 2: Add ranks if requested
|
|
105
113
|
if add_ranks:
|
|
106
|
-
logger.info("Computing ranks and readability levels...")
|
|
107
|
-
scored_dataset = _add_ranks_to_dataset(scored_dataset)
|
|
114
|
+
logger.info(f"Computing ranks and readability levels (method={level_method})...")
|
|
115
|
+
scored_dataset = _add_ranks_to_dataset(scored_dataset, level_method, osman_weight)
|
|
108
116
|
logger.info("Ranks and levels added")
|
|
109
117
|
|
|
110
118
|
logger.info("Readability scoring complete!")
|
|
111
119
|
return scored_dataset
|
|
112
120
|
|
|
113
121
|
|
|
114
|
-
def _add_ranks_to_dataset(
|
|
122
|
+
def _add_ranks_to_dataset(
|
|
123
|
+
dataset: Dataset, level_method: str = WEIGHTED, osman_weight: float = OSMAN_WEIGHT
|
|
124
|
+
) -> Dataset:
|
|
115
125
|
"""
|
|
116
126
|
Add ranking columns to dataset based on scores.
|
|
117
127
|
|
|
@@ -153,7 +163,9 @@ def _add_ranks_to_dataset(dataset: Dataset) -> Dataset:
|
|
|
153
163
|
return dataset
|
|
154
164
|
|
|
155
165
|
# Compute ranks and levels
|
|
156
|
-
o_ranks, f_ranks, final_levels = compute_ranks_and_levels(
|
|
166
|
+
o_ranks, f_ranks, final_levels = compute_ranks_and_levels(
|
|
167
|
+
osman_scores, flesch_scores, method=level_method, osman_weight=osman_weight
|
|
168
|
+
)
|
|
157
169
|
|
|
158
170
|
# Create mapping from index to rank data
|
|
159
171
|
rank_data = {}
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Arabic Flesch Reading Ease scoring.
|
|
3
|
+
|
|
4
|
+
textstat's flesch_reading_ease() cannot score Arabic: it counts syllables with
|
|
5
|
+
pyphen, which ships no Arabic hyphenation dictionary and raises KeyError. This
|
|
6
|
+
module instead counts Arabic syllables directly from diacritics, with a
|
|
7
|
+
character-length fallback for undiacritised text, so Flesch is computable for
|
|
8
|
+
Arabic in all cases.
|
|
9
|
+
|
|
10
|
+
Ported from the original OSMAN readability implementation by Mahmoud El-Haj
|
|
11
|
+
(OsmanReadability.java, Syllables.java).
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import re
|
|
15
|
+
|
|
16
|
+
# Short vowels (harakat): fatha, damma, kasra.
|
|
17
|
+
HARAKAT = ("َ", "ُ", "ِ")
|
|
18
|
+
# Long-vowel letters that turn a preceding haraka into a long syllable: alef, waw, yaa.
|
|
19
|
+
LONG_LETTERS = ("ا", "و", "ي")
|
|
20
|
+
# Stress marks: tanween fath, tanween damm, tanween kasr, shadda.
|
|
21
|
+
STRESS_MARKS = ("ً", "ٌ", "ٍ", "ّ")
|
|
22
|
+
|
|
23
|
+
PUNCT_PATTERN = re.compile(r"[^\w\s]", flags=re.UNICODE)
|
|
24
|
+
DIGIT_PATTERN = re.compile(r"\d")
|
|
25
|
+
SENTENCE_PATTERN = re.compile(r"\n|(?<!\d)\.(?!\d)")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def count_all_syllables(text: str) -> tuple[int, int, int]:
|
|
29
|
+
"""
|
|
30
|
+
Count Arabic short, long, and stress syllables.
|
|
31
|
+
|
|
32
|
+
Long syllables are harakat followed by a long-vowel letter; the remaining
|
|
33
|
+
harakat are short. Stress syllables are tanween and shadda marks. For
|
|
34
|
+
undiacritised text (no short syllables found), short syllables are
|
|
35
|
+
approximated from the stripped character length.
|
|
36
|
+
|
|
37
|
+
Args:
|
|
38
|
+
text: Text to analyse
|
|
39
|
+
|
|
40
|
+
Returns:
|
|
41
|
+
Tuple of (short_syllables, long_syllables, stress_syllables)
|
|
42
|
+
"""
|
|
43
|
+
long_count = 0
|
|
44
|
+
short_count = 0
|
|
45
|
+
|
|
46
|
+
for haraka in HARAKAT:
|
|
47
|
+
for i, char in enumerate(text):
|
|
48
|
+
if char == haraka:
|
|
49
|
+
if i + 1 < len(text) and text[i + 1] in LONG_LETTERS:
|
|
50
|
+
long_count += 1
|
|
51
|
+
else:
|
|
52
|
+
short_count += 1
|
|
53
|
+
|
|
54
|
+
stress_count = sum(text.count(mark) for mark in STRESS_MARKS)
|
|
55
|
+
|
|
56
|
+
# Fallback for undiacritised text: approximate short syllables from length.
|
|
57
|
+
if short_count == 0:
|
|
58
|
+
stripped = (
|
|
59
|
+
text.replace("ا", "")
|
|
60
|
+
.replace("ى", "")
|
|
61
|
+
.replace("?", "")
|
|
62
|
+
.replace(".", "")
|
|
63
|
+
.replace("!", "")
|
|
64
|
+
.replace(",", "")
|
|
65
|
+
.replace(" ", "")
|
|
66
|
+
)
|
|
67
|
+
short_count = len(stripped) - 2
|
|
68
|
+
|
|
69
|
+
return short_count, long_count, stress_count
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def count_syllables(text: str) -> int:
|
|
73
|
+
"""
|
|
74
|
+
Count total syllables, weighting long and stress syllables double.
|
|
75
|
+
|
|
76
|
+
Args:
|
|
77
|
+
text: Text to analyse
|
|
78
|
+
|
|
79
|
+
Returns:
|
|
80
|
+
Total syllable count
|
|
81
|
+
"""
|
|
82
|
+
short_syl, long_syl, stress_syl = count_all_syllables(text)
|
|
83
|
+
return (long_syl * 2) + short_syl + (stress_syl * 2)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def count_words(text: str) -> int:
|
|
87
|
+
"""
|
|
88
|
+
Count words after removing digits and punctuation.
|
|
89
|
+
|
|
90
|
+
Args:
|
|
91
|
+
text: Text to analyse
|
|
92
|
+
|
|
93
|
+
Returns:
|
|
94
|
+
Number of whitespace-separated words
|
|
95
|
+
"""
|
|
96
|
+
cleaned = DIGIT_PATTERN.sub("", text)
|
|
97
|
+
cleaned = PUNCT_PATTERN.sub("", cleaned)
|
|
98
|
+
cleaned = re.sub(r" +", " ", cleaned.strip())
|
|
99
|
+
return len(cleaned.split()) if cleaned else 0
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def count_sentences(text: str) -> int:
|
|
103
|
+
"""
|
|
104
|
+
Count sentences by splitting on newlines and non-decimal periods.
|
|
105
|
+
|
|
106
|
+
Args:
|
|
107
|
+
text: Text to analyse
|
|
108
|
+
|
|
109
|
+
Returns:
|
|
110
|
+
Number of sentences
|
|
111
|
+
"""
|
|
112
|
+
# Match Java's String.split(regex), which discards trailing empty strings,
|
|
113
|
+
# so a trailing period does not count as an extra empty sentence.
|
|
114
|
+
parts = SENTENCE_PATTERN.split(text)
|
|
115
|
+
while parts and parts[-1] == "":
|
|
116
|
+
parts.pop()
|
|
117
|
+
return len(parts)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def words_per_sentence(text: str) -> float:
|
|
121
|
+
"""Return the average number of words per sentence."""
|
|
122
|
+
words = count_words(text)
|
|
123
|
+
sentences = count_sentences(text)
|
|
124
|
+
return words / sentences if sentences else float(words)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def syllables_per_word(text: str) -> float:
|
|
128
|
+
"""Return the average number of syllables per word."""
|
|
129
|
+
words = count_words(text)
|
|
130
|
+
return count_syllables(text) / words if words else 0.0
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def arabic_flesch_reading_ease(text: str) -> float | None:
|
|
134
|
+
"""
|
|
135
|
+
Calculate Arabic Flesch Reading Ease.
|
|
136
|
+
|
|
137
|
+
Score = 206.835 - 1.015 * (words / sentence) - 84.6 * (syllables / word)
|
|
138
|
+
|
|
139
|
+
Args:
|
|
140
|
+
text: Text to score
|
|
141
|
+
|
|
142
|
+
Returns:
|
|
143
|
+
Flesch Reading Ease score, or None for empty or word-less text
|
|
144
|
+
"""
|
|
145
|
+
if not text or not text.strip() or count_words(text) == 0:
|
|
146
|
+
return None
|
|
147
|
+
return 206.835 - (1.015 * words_per_sentence(text)) - (84.6 * syllables_per_word(text))
|
|
@@ -8,9 +8,21 @@ from dalla_data_processing.utils.logger import get_logger
|
|
|
8
8
|
|
|
9
9
|
logger = get_logger(__name__)
|
|
10
10
|
|
|
11
|
+
# Strategies for combining the Osman and Flesch bins into a final level.
|
|
12
|
+
WEIGHTED = "weighted" # Osman-dominant weighted average (default)
|
|
13
|
+
CONSERVATIVE = "conservative" # legacy regime-split (Option B3)
|
|
14
|
+
LEVEL_METHODS = (WEIGHTED, CONSERVATIVE)
|
|
15
|
+
|
|
16
|
+
# Default weight on the Osman bin for the "weighted" method. Osman is the more
|
|
17
|
+
# reliable signal for Arabic, so it dominates; Flesch only nudges the result.
|
|
18
|
+
OSMAN_WEIGHT = 0.8
|
|
19
|
+
|
|
11
20
|
|
|
12
21
|
def compute_ranks_and_levels(
|
|
13
|
-
osman_scores: list[float],
|
|
22
|
+
osman_scores: list[float],
|
|
23
|
+
flesch_scores: list[float],
|
|
24
|
+
method: str = WEIGHTED,
|
|
25
|
+
osman_weight: float = OSMAN_WEIGHT,
|
|
14
26
|
) -> tuple[list[int], list[int], list[int]]:
|
|
15
27
|
"""
|
|
16
28
|
Compute ranks and final readability levels.
|
|
@@ -18,11 +30,13 @@ def compute_ranks_and_levels(
|
|
|
18
30
|
Methodology:
|
|
19
31
|
1. Rank documents by Osman & Flesch (highest score = rank 1, easiest)
|
|
20
32
|
2. Bin ranks into 5 levels (0-4) using quantiles (guarantees balanced bins)
|
|
21
|
-
3. Decide final level
|
|
33
|
+
3. Decide the final level from the two bins (see decide_final_level)
|
|
22
34
|
|
|
23
35
|
Args:
|
|
24
36
|
osman_scores: List of Osman scores
|
|
25
37
|
flesch_scores: List of Flesch scores
|
|
38
|
+
method: How to combine the bins ("weighted" or "conservative")
|
|
39
|
+
osman_weight: Weight on the Osman bin when method="weighted"
|
|
26
40
|
|
|
27
41
|
Returns:
|
|
28
42
|
Tuple of:
|
|
@@ -51,7 +65,10 @@ def compute_ranks_and_levels(
|
|
|
51
65
|
f_bins = bin_ranks(f_ranks)
|
|
52
66
|
|
|
53
67
|
# Decide final level
|
|
54
|
-
final_levels = [
|
|
68
|
+
final_levels = [
|
|
69
|
+
decide_final_level(ob, fb, method=method, osman_weight=osman_weight)
|
|
70
|
+
for ob, fb in zip(o_bins, f_bins, strict=True)
|
|
71
|
+
]
|
|
55
72
|
|
|
56
73
|
return (o_ranks, f_ranks, final_levels)
|
|
57
74
|
|
|
@@ -111,55 +128,48 @@ def bin_ranks(ranks: list[int]) -> list[int]:
|
|
|
111
128
|
return bins
|
|
112
129
|
|
|
113
130
|
|
|
114
|
-
def decide_final_level(
|
|
131
|
+
def decide_final_level(
    o_bin: int, f_bin: int, method: str = WEIGHTED, osman_weight: float = OSMAN_WEIGHT
) -> int:
    """
    Decide final readability level from the Osman and Flesch bins.

    Two strategies are available:

    "weighted" (default): an Osman-dominant weighted average,
    osman_weight * o_bin + (1 - osman_weight) * f_bin, rounded to the nearest
    level with ties going to the harder level.
    For Arabic, Osman is the reliable signal (it carries Arabic-specific terms
    such as faseeh and complex/long-word ratios that hold up on undiacritised
    text), whereas Flesch depends on syllable counts that degrade without
    diacritics. Flesch therefore only nudges the level rather than overriding it.

    "conservative": the legacy regime-split (Option B3) — trust Osman when it
    indicates hardness (bins 3-4), trust Flesch when it indicates easiness
    (bins 0-1), take the harder bin on large disagreement, else average.

    Args:
        o_bin: Osman bin (0-4, 0=easiest, 4=hardest)
        f_bin: Flesch bin (0-4, 0=easiest, 4=hardest)
        method: "weighted" or "conservative"
        osman_weight: Weight on the Osman bin when method="weighted";
            must lie in [0, 1]

    Returns:
        Final level (0-4)

    Raises:
        ValueError: If method is unknown, or if method="weighted" and
            osman_weight is outside [0, 1]

    Examples:
        >>> decide_final_level(4, 0)  # weighted, Osman dominates
        3
        >>> decide_final_level(0, 2, osman_weight=0.75)  # tie at 0.5 -> harder level
        1
        >>> decide_final_level(0, 4, method="conservative")  # bins differ by >= 2 -> harder bin
        4
    """
    if method == WEIGHTED:
        # A weight outside [0, 1] could push the result outside the 0-4 range.
        if not 0.0 <= osman_weight <= 1.0:
            raise ValueError(f"osman_weight must be between 0 and 1, got {osman_weight!r}")
        blended = osman_weight * o_bin + (1 - osman_weight) * f_bin
        # Round half up. Built-in round() rounds ties to the nearest even
        # integer, which would send 0.5 down but 1.5 up. The value is
        # non-negative, so int() truncation acts as floor here.
        return int(blended + 0.5)
    if method == CONSERVATIVE:
        # Osman says hard: take its bin as-is.
        if o_bin >= 3:
            return o_bin
        # Flesch says easy: take its bin as-is.
        if f_bin <= 1:
            return f_bin
        # Large disagreement: keep the harder of the two bins.
        if abs(o_bin - f_bin) >= 2:
            return max(o_bin, f_bin)
        # Agreement or a one-bin gap: average, rounding half up.
        return (o_bin + f_bin + 1) // 2
    raise ValueError(f"Unknown level method {method!r}; expected one of {LEVEL_METHODS}")
|
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Readability scoring
|
|
2
|
+
Readability scoring for Arabic text.
|
|
3
3
|
|
|
4
|
-
|
|
4
|
+
Osman scores come from the textstat library. Flesch Reading Ease uses an Arabic
|
|
5
|
+
syllable-based implementation (see arabic_flesch), since textstat's Flesch relies
|
|
6
|
+
on pyphen, which has no Arabic support.
|
|
5
7
|
"""
|
|
6
8
|
|
|
7
9
|
import textstat
|
|
8
10
|
|
|
11
|
+
from dalla_data_processing.readability.arabic_flesch import arabic_flesch_reading_ease
|
|
9
12
|
from dalla_data_processing.utils.logger import get_logger
|
|
10
13
|
|
|
11
14
|
logger = get_logger(__name__)
|
|
@@ -31,8 +34,7 @@ class ReadabilityScorer:
|
|
|
31
34
|
"""
|
|
32
35
|
Score text using both Flesch and Osman methods.
|
|
33
36
|
|
|
34
|
-
|
|
35
|
-
If Osman also fails, we use a simple fallback based on word length.
|
|
37
|
+
If both scores fail, fall back to a simple estimate based on word length.
|
|
36
38
|
|
|
37
39
|
Args:
|
|
38
40
|
text: Text to score
|
|
@@ -46,13 +48,8 @@ class ReadabilityScorer:
|
|
|
46
48
|
flesch_score = self._calculate_flesch(text)
|
|
47
49
|
osman_score = self._calculate_osman(text)
|
|
48
50
|
|
|
49
|
-
# If Flesch fails but Osman succeeds, use Osman for both
|
|
50
|
-
if flesch_score is None and osman_score is not None:
|
|
51
|
-
logger.info(f"Flesch failed, using Osman score ({osman_score:.1f}) for both metrics")
|
|
52
|
-
flesch_score = osman_score
|
|
53
|
-
|
|
54
51
|
# If both fail, use fallback as last resort
|
|
55
|
-
|
|
52
|
+
if flesch_score is None and osman_score is None:
|
|
56
53
|
flesch_fallback, osman_fallback = self._calculate_fallback_scores(text)
|
|
57
54
|
flesch_score = flesch_fallback
|
|
58
55
|
osman_score = osman_fallback
|
|
@@ -64,9 +61,9 @@ class ReadabilityScorer:
|
|
|
64
61
|
|
|
65
62
|
def _calculate_flesch(self, text: str) -> float | None:
|
|
66
63
|
"""
|
|
67
|
-
Calculate Flesch Reading Ease score.
|
|
64
|
+
Calculate Arabic Flesch Reading Ease score.
|
|
68
65
|
|
|
69
|
-
|
|
66
|
+
Higher scores indicate easier text (typically 0-100, but unbounded).
|
|
70
67
|
|
|
71
68
|
Args:
|
|
72
69
|
text: Text to score
|
|
@@ -75,7 +72,7 @@ class ReadabilityScorer:
|
|
|
75
72
|
Flesch score or None if error
|
|
76
73
|
"""
|
|
77
74
|
try:
|
|
78
|
-
score =
|
|
75
|
+
score = arabic_flesch_reading_ease(text)
|
|
79
76
|
if score is None:
|
|
80
77
|
logger.debug(f"Flesch score is None for text (length={len(text)})")
|
|
81
78
|
return None
|
|
@@ -44,14 +44,13 @@ _LATIN = r"a-zA-Z"
|
|
|
44
44
|
_DIGITS = r"0-9\u0660-\u0669\u06F0-\u06F9"
|
|
45
45
|
_COMPACT_CHARSET = _ARABIC + _LATIN + _DIGITS
|
|
46
46
|
|
|
47
|
-
|
|
47
|
+
|
|
48
48
|
_FULL_CHARSET = r"\w"
|
|
49
49
|
|
|
50
|
-
|
|
50
|
+
|
|
51
51
|
_COMPACT_RE = re.compile(f"[{_COMPACT_CHARSET}]+|[^{_COMPACT_CHARSET}\\s]|\\s+")
|
|
52
52
|
_COMPACT_SPLIT_RE = re.compile(f"[{_ARABIC}{_LATIN}]+|[{_DIGITS}]+|[^{_COMPACT_CHARSET}\\s]|\\s+")
|
|
53
53
|
|
|
54
|
-
# Pre-compiled regexes for full mode
|
|
55
54
|
_FULL_RE = re.compile(r"\w+|[^\w\s]|\s+")
|
|
56
55
|
_FULL_SPLIT_RE = re.compile(r"[^\W\d]+|\d+|[^\w\s]|\s+")
|
|
57
56
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dalla-data-processing
|
|
3
|
-
Version: 0.0.11
|
|
3
|
+
Version: 0.0.12
|
|
4
4
|
Summary: data processing pipeline with deduplication, stemming, quality checking, and readability scoring, used for the DALLA Models
|
|
5
5
|
Author-email: Hadi Hamoud <hhamoud@dohainstitute.edu.qa>, Digital Research Unit - Arab Center <dru@dohainstitute.edu.qa>
|
|
6
6
|
License: CC-BY-NC-SA-4.0
|
|
@@ -57,7 +57,7 @@ A comprehensive Arabic data processing pipeline with deduplication, stemming, qu
|
|
|
57
57
|
|
|
58
58
|
- **Linux**: Fully supported
|
|
59
59
|
- **macOS**: Fully supported (Intel or through rosetta)
|
|
60
|
-
- **Windows**: Supported through WSL
|
|
60
|
+
- **Windows**: Supported through WSL. On native Windows, deduplication works when built manually from source.
|
|
61
61
|
|
|
62
62
|
## Installation
|
|
63
63
|
|
|
@@ -98,7 +98,7 @@ pip install "dalla-data-processing[dedup,stem,quality]"
|
|
|
98
98
|
|
|
99
99
|
### Development Installation
|
|
100
100
|
|
|
101
|
-
<b>From Source (with uv
|
|
101
|
+
<b>From Source (with uv)</b>
|
|
102
102
|
|
|
103
103
|
```bash
|
|
104
104
|
git clone https://github.com/U4RASD/dalla-data-processing.git
|
|
@@ -148,6 +148,6 @@ Pack and prepare datasets for training.
|
|
|
148
148
|
|
|
149
149
|
## Links
|
|
150
150
|
|
|
151
|
-
- Homepage: https://
|
|
152
|
-
- Issues: https://github.com/U4RASD/dalla-data-processing/issues
|
|
151
|
+
- Homepage: https://acrps.ai
|
|
153
152
|
- Documentation: https://github.com/U4RASD/dalla-data-processing#readme
|
|
153
|
+
- ACRPS: https://acr.ps
|
|
@@ -65,6 +65,7 @@ dalla_data_processing/quality/__init__.py
|
|
|
65
65
|
dalla_data_processing/quality/checker.py
|
|
66
66
|
dalla_data_processing/readability/README.md
|
|
67
67
|
dalla_data_processing/readability/__init__.py
|
|
68
|
+
dalla_data_processing/readability/arabic_flesch.py
|
|
68
69
|
dalla_data_processing/readability/ranking.py
|
|
69
70
|
dalla_data_processing/readability/scorer.py
|
|
70
71
|
dalla_data_processing/stemming/README.md
|
|
@@ -12,13 +12,14 @@ NC='\033[0m' # No Color
|
|
|
12
12
|
|
|
13
13
|
echo -e "${GREEN}=== Building Onion Binary ===${NC}"
|
|
14
14
|
|
|
15
|
-
|
|
15
|
+
|
|
16
|
+
|
|
16
17
|
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
|
|
17
18
|
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
|
|
18
19
|
ONION_SOURCE="$PROJECT_ROOT/dalla_data_processing/deduplication/onion/src_sc"
|
|
19
20
|
OUTPUT_DIR="$PROJECT_ROOT/dalla_data_processing/deduplication/bin"
|
|
20
21
|
|
|
21
|
-
|
|
22
|
+
|
|
22
23
|
if [ ! -d "$ONION_SOURCE" ]; then
|
|
23
24
|
echo -e "${RED}Error: Onion source not found at $ONION_SOURCE${NC}"
|
|
24
25
|
exit 1
|
|
@@ -33,7 +34,7 @@ if ! command -v g++ &> /dev/null; then
|
|
|
33
34
|
exit 1
|
|
34
35
|
fi
|
|
35
36
|
|
|
36
|
-
|
|
37
|
+
|
|
37
38
|
echo -e "${YELLOW}Checking for Google sparsehash...${NC}"
|
|
38
39
|
if ! echo '#include <google/sparse_hash_set>' | g++ -x c++ -c - -o /dev/null 2>/dev/null; then
|
|
39
40
|
echo -e "${YELLOW}Warning: Google sparsehash headers not found${NC}"
|
|
@@ -1,34 +0,0 @@
|
|
|
1
|
-
# file generated by setuptools-scm
|
|
2
|
-
# don't change, don't track in version control
|
|
3
|
-
|
|
4
|
-
__all__ = [
|
|
5
|
-
"__version__",
|
|
6
|
-
"__version_tuple__",
|
|
7
|
-
"version",
|
|
8
|
-
"version_tuple",
|
|
9
|
-
"__commit_id__",
|
|
10
|
-
"commit_id",
|
|
11
|
-
]
|
|
12
|
-
|
|
13
|
-
TYPE_CHECKING = False
|
|
14
|
-
if TYPE_CHECKING:
|
|
15
|
-
from typing import Tuple
|
|
16
|
-
from typing import Union
|
|
17
|
-
|
|
18
|
-
VERSION_TUPLE = Tuple[Union[int, str], ...]
|
|
19
|
-
COMMIT_ID = Union[str, None]
|
|
20
|
-
else:
|
|
21
|
-
VERSION_TUPLE = object
|
|
22
|
-
COMMIT_ID = object
|
|
23
|
-
|
|
24
|
-
version: str
|
|
25
|
-
__version__: str
|
|
26
|
-
__version_tuple__: VERSION_TUPLE
|
|
27
|
-
version_tuple: VERSION_TUPLE
|
|
28
|
-
commit_id: COMMIT_ID
|
|
29
|
-
__commit_id__: COMMIT_ID
|
|
30
|
-
|
|
31
|
-
__version__ = version = '0.0.11'
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 0, 11)
|
|
33
|
-
|
|
34
|
-
__commit_id__ = commit_id = 'g5e00041e7'
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/core/README.md
RENAMED
|
File without changes
|
{dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/core/__init__.py
RENAMED
|
File without changes
|
{dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/core/dataset.py
RENAMED
|
File without changes
|
{dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/core/parallel.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dalla_data_processing-0.0.11 → dalla_data_processing-0.0.12}/dalla_data_processing/utils/logger.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|