diffinite 0.8.0__tar.gz → 0.9.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {diffinite-0.8.0/src/diffinite.egg-info → diffinite-0.9.1}/PKG-INFO +1 -1
- {diffinite-0.8.0 → diffinite-0.9.1}/pyproject.toml +1 -1
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/cli.py +36 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/differ.py +32 -10
- diffinite-0.9.1/src/diffinite/languages/data.py +64 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/pipeline.py +25 -4
- {diffinite-0.8.0 → diffinite-0.9.1/src/diffinite.egg-info}/PKG-INFO +1 -1
- diffinite-0.8.0/src/diffinite/languages/data.py +0 -36
- {diffinite-0.8.0 → diffinite-0.9.1}/LICENSE +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/NOTICE +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/README.md +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/setup.cfg +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/__init__.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/__main__.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/collector.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/deep_compare.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/evidence.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/fingerprint.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/languages/__init__.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/languages/_registry.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/languages/_spec.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/languages/c_family.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/languages/csharp.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/languages/go_rust_swift.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/languages/java.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/languages/javascript.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/languages/markup.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/languages/python.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/languages/scripting.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/models.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/parser.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/pdf_gen.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite.egg-info/SOURCES.txt +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite.egg-info/dependency_links.txt +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite.egg-info/entry_points.txt +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite.egg-info/requires.txt +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite.egg-info/top_level.txt +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_cli.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_collector.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_deep_compare.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_differ.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_differ_extended.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_evidence.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_evidence_hash.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_fingerprint.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_languages.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_normalize.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_parser.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_pdf_gen.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_pipeline.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_plagiarism_dataset.py +0 -0
- {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_sqlite_integration.py +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "diffinite"
|
|
7
|
-
version = "0.8.0"
|
|
7
|
+
version = "0.9.1"
|
|
8
8
|
description = "Forensic source-code comparison tool — Winnowing fingerprints and professional PDF reports for IP litigation & code audit"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "Apache-2.0"}
|
|
@@ -95,6 +95,34 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
95
95
|
default=FUZZY_THRESHOLD,
|
|
96
96
|
help=f"Fuzzy matching threshold 0–100 (default: {FUZZY_THRESHOLD})",
|
|
97
97
|
)
|
|
98
|
+
parser.add_argument(
|
|
99
|
+
"--encoding",
|
|
100
|
+
default="auto",
|
|
101
|
+
help=(
|
|
102
|
+
"Source file encoding. 'auto' (default) uses charset-normalizer "
|
|
103
|
+
"auto-detection with Korean-optimized fallback (utf-8 -> euc-kr -> cp949). "
|
|
104
|
+
"Specify an explicit encoding (e.g. euc-kr, utf-8, cp949, shift_jis, "
|
|
105
|
+
"gb2312) to force-decode all files with that encoding."
|
|
106
|
+
),
|
|
107
|
+
)
|
|
108
|
+
parser.add_argument(
|
|
109
|
+
"--sort-by",
|
|
110
|
+
choices=["filename", "size", "ratio"],
|
|
111
|
+
default=None,
|
|
112
|
+
dest="sort_by",
|
|
113
|
+
help=(
|
|
114
|
+
"Sort matched file pairs in the report. "
|
|
115
|
+
"'filename' sorts by file path, 'size' by file size, "
|
|
116
|
+
"'ratio' by similarity ratio. Default: insertion order (no sort)."
|
|
117
|
+
),
|
|
118
|
+
)
|
|
119
|
+
parser.add_argument(
|
|
120
|
+
"--sort-order",
|
|
121
|
+
choices=["asc", "desc"],
|
|
122
|
+
default="asc",
|
|
123
|
+
dest="sort_order",
|
|
124
|
+
help="Sort direction (default: asc). Only effective with --sort-by.",
|
|
125
|
+
)
|
|
98
126
|
|
|
99
127
|
# ── Output modes ──────────────────────────────────────────────────
|
|
100
128
|
parser.add_argument(
|
|
@@ -280,6 +308,9 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
280
308
|
autojunk=not args.no_autojunk,
|
|
281
309
|
)
|
|
282
310
|
|
|
311
|
+
# Resolve encoding
|
|
312
|
+
encoding = args.encoding if args.encoding.lower() != "auto" else None
|
|
313
|
+
|
|
283
314
|
run_pipeline(
|
|
284
315
|
dir_a=args.dir_a,
|
|
285
316
|
dir_b=args.dir_b,
|
|
@@ -313,6 +344,11 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
313
344
|
report_html=args.report_html,
|
|
314
345
|
report_md=args.report_md,
|
|
315
346
|
report_json=args.report_json,
|
|
347
|
+
# Encoding
|
|
348
|
+
encoding=encoding,
|
|
349
|
+
# Sorting
|
|
350
|
+
sort_by=args.sort_by,
|
|
351
|
+
sort_order=args.sort_order,
|
|
316
352
|
)
|
|
317
353
|
|
|
318
354
|
|
|
@@ -38,12 +38,17 @@ logger = logging.getLogger(__name__)
|
|
|
38
38
|
# ──────────────────────────────────────────────────────────────────────
|
|
39
39
|
# 인코딩 자동 감지 파일 리더
|
|
40
40
|
# ──────────────────────────────────────────────────────────────────────
|
|
41
|
-
def read_file(path: str) -> Optional[str]:
|
|
41
|
+
def read_file(path: str, encoding: str | None = None) -> Optional[str]:
|
|
42
42
|
"""파일을 읽고 인코딩을 자동 감지하여 유니코드 문자열로 반환한다.
|
|
43
43
|
|
|
44
44
|
``charset_normalizer.from_bytes()``는 BOM, 통계적 분석 등을 조합하여
|
|
45
45
|
UTF-8, EUC-KR, Shift_JIS 등 다양한 인코딩을 감지한다.
|
|
46
46
|
|
|
47
|
+
Args:
|
|
48
|
+
path: 파일 경로.
|
|
49
|
+
encoding: 인코딩 지정. ``None`` 또는 ``"auto"``이면 자동 감지.
|
|
50
|
+
``"euc-kr"``, ``"utf-8"`` 등 지정하면 해당 인코딩으로 강제 디코딩.
|
|
51
|
+
|
|
47
52
|
Returns:
|
|
48
53
|
디코딩된 텍스트. 빈 파일은 ``""``, 감지 실패 시 ``None``.
|
|
49
54
|
|
|
@@ -60,16 +65,33 @@ def read_file(path: str) -> Optional[str]:
|
|
|
60
65
|
if not raw:
|
|
61
66
|
return ""
|
|
62
67
|
|
|
63
|
-
|
|
64
|
-
if
|
|
65
|
-
|
|
66
|
-
|
|
68
|
+
# ── Manual encoding specified ────────────────────────────────
|
|
69
|
+
if encoding and encoding.lower() not in ("auto", ""):
|
|
70
|
+
try:
|
|
71
|
+
return raw.decode(encoding)
|
|
72
|
+
except (UnicodeDecodeError, LookupError) as exc:
|
|
73
|
+
logger.error("Decoding failed for %s with encoding %s: %s",
|
|
74
|
+
path, encoding, exc)
|
|
75
|
+
return None
|
|
67
76
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
77
|
+
# ── Auto-detect with charset_normalizer ──────────────────────
|
|
78
|
+
result = from_bytes(raw).best()
|
|
79
|
+
if result is not None:
|
|
80
|
+
try:
|
|
81
|
+
return str(result)
|
|
82
|
+
except Exception as exc: # noqa: BLE001
|
|
83
|
+
logger.warning("charset_normalizer decode failed for %s (%s): %s",
|
|
84
|
+
path, result.encoding, exc)
|
|
85
|
+
|
|
86
|
+
# ── Fallback chain (Korean-optimized) ────────────────────────
|
|
87
|
+
for fallback_enc in ("utf-8", "euc-kr", "cp949"):
|
|
88
|
+
try:
|
|
89
|
+
return raw.decode(fallback_enc)
|
|
90
|
+
except (UnicodeDecodeError, LookupError):
|
|
91
|
+
continue
|
|
92
|
+
|
|
93
|
+
logger.warning("Could not detect encoding for %s — skipping", path)
|
|
94
|
+
return None
|
|
73
95
|
|
|
74
96
|
|
|
75
97
|
# ──────────────────────────────────────────────────────────────────────
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Data / query languages: SQL, YAML, TOML."""
|
|
2
|
+
|
|
3
|
+
from diffinite.languages._spec import LangSpec
|
|
4
|
+
from diffinite.languages._registry import register
|
|
5
|
+
from diffinite.models import CommentSpec
|
|
6
|
+
|
|
7
|
+
# ── SQL ───────────────────────────────────────────────────────────
|
|
8
|
+
register(LangSpec(
|
|
9
|
+
name="SQL",
|
|
10
|
+
extensions=(".sql", ".ddl", ".dml", ".pks", ".pkb", ".plsql", ".tsql"),
|
|
11
|
+
comment=CommentSpec(line_markers=("--",), block_start="/*", block_end="*/"),
|
|
12
|
+
keywords=frozenset({
|
|
13
|
+
# DML
|
|
14
|
+
"select", "from", "where", "insert", "update", "delete", "merge",
|
|
15
|
+
"into", "values", "set", "returning",
|
|
16
|
+
# DDL
|
|
17
|
+
"create", "drop", "alter", "table", "index", "view", "schema",
|
|
18
|
+
"database", "sequence", "trigger", "procedure", "function",
|
|
19
|
+
"column", "constraint", "primary", "foreign", "key", "references",
|
|
20
|
+
"unique", "check", "default", "auto_increment", "identity",
|
|
21
|
+
"cascade", "restrict", "truncate", "rename", "replace",
|
|
22
|
+
# Joins
|
|
23
|
+
"join", "inner", "outer", "left", "right", "cross", "full",
|
|
24
|
+
"natural", "on", "using",
|
|
25
|
+
# Operators & conditions
|
|
26
|
+
"and", "or", "not", "in", "like", "ilike", "between",
|
|
27
|
+
"is", "null", "true", "false", "case", "when", "then", "else",
|
|
28
|
+
# Aggregates & ordering
|
|
29
|
+
"order", "by", "group", "having", "limit", "offset", "fetch",
|
|
30
|
+
"as", "distinct", "union", "intersect", "except", "all", "exists",
|
|
31
|
+
"count", "sum", "avg", "min", "max",
|
|
32
|
+
# Window functions
|
|
33
|
+
"over", "partition", "row_number", "rank", "dense_rank",
|
|
34
|
+
"lead", "lag", "first_value", "last_value", "ntile",
|
|
35
|
+
"rows", "range", "unbounded", "preceding", "following", "current",
|
|
36
|
+
# Transactions & control flow
|
|
37
|
+
"begin", "end", "commit", "rollback", "savepoint", "transaction",
|
|
38
|
+
# DCL
|
|
39
|
+
"grant", "revoke", "deny",
|
|
40
|
+
# Subquery & CTE
|
|
41
|
+
"with", "recursive",
|
|
42
|
+
# Types
|
|
43
|
+
"int", "integer", "varchar", "char", "text", "boolean", "date",
|
|
44
|
+
"timestamp", "decimal", "numeric", "float", "double", "blob", "clob",
|
|
45
|
+
# Procedural (PL/SQL, T-SQL)
|
|
46
|
+
"declare", "variable", "cursor", "open", "close", "fetch",
|
|
47
|
+
"loop", "while", "for", "if", "elseif", "elsif", "return",
|
|
48
|
+
"exec", "execute", "call", "raise", "exception", "handler",
|
|
49
|
+
}),
|
|
50
|
+
))
|
|
51
|
+
|
|
52
|
+
# ── YAML ──────────────────────────────────────────────────────────
|
|
53
|
+
register(LangSpec(
|
|
54
|
+
name="YAML",
|
|
55
|
+
extensions=(".yaml", ".yml"),
|
|
56
|
+
comment=CommentSpec(line_markers=("#",), block_start=None, block_end=None),
|
|
57
|
+
))
|
|
58
|
+
|
|
59
|
+
# ── TOML ──────────────────────────────────────────────────────────
|
|
60
|
+
register(LangSpec(
|
|
61
|
+
name="TOML",
|
|
62
|
+
extensions=(".toml",),
|
|
63
|
+
comment=CommentSpec(line_markers=("#",), block_start=None, block_end=None),
|
|
64
|
+
))
|
|
@@ -377,6 +377,11 @@ def run_pipeline(
|
|
|
377
377
|
# Evidence integrity
|
|
378
378
|
embed_hash: bool = False,
|
|
379
379
|
bundle_path: str | None = None,
|
|
380
|
+
# Encoding
|
|
381
|
+
encoding: str | None = None,
|
|
382
|
+
# Sorting
|
|
383
|
+
sort_by: str | None = None,
|
|
384
|
+
sort_order: str = "asc",
|
|
380
385
|
) -> None:
|
|
381
386
|
"""Execute the full diff-to-report pipeline.
|
|
382
387
|
|
|
@@ -448,8 +453,8 @@ def run_pipeline(
|
|
|
448
453
|
abs_b = str(root_b / m.rel_path_b)
|
|
449
454
|
ext = Path(m.rel_path_a).suffix.lower()
|
|
450
455
|
|
|
451
|
-
text_a = read_file(abs_a)
|
|
452
|
-
text_b = read_file(abs_b)
|
|
456
|
+
text_a = read_file(abs_a, encoding=encoding)
|
|
457
|
+
text_b = read_file(abs_b, encoding=encoding)
|
|
453
458
|
|
|
454
459
|
if text_a is None or text_b is None:
|
|
455
460
|
results.append(DiffResult(
|
|
@@ -491,8 +496,8 @@ def run_pipeline(
|
|
|
491
496
|
abs_b = str(root_b / m.rel_path_b)
|
|
492
497
|
ext = Path(m.rel_path_a).suffix.lower()
|
|
493
498
|
|
|
494
|
-
text_a = read_file(abs_a)
|
|
495
|
-
text_b = read_file(abs_b)
|
|
499
|
+
text_a = read_file(abs_a, encoding=encoding)
|
|
500
|
+
text_b = read_file(abs_b, encoding=encoding)
|
|
496
501
|
if text_a is None or text_b is None:
|
|
497
502
|
continue
|
|
498
503
|
if not compare_comment:
|
|
@@ -519,6 +524,22 @@ def run_pipeline(
|
|
|
519
524
|
|
|
520
525
|
total_files = len(results)
|
|
521
526
|
|
|
527
|
+
# ── Sort results ──────────────────────────────────────────────
|
|
528
|
+
if sort_by:
|
|
529
|
+
reverse = sort_order == "desc"
|
|
530
|
+
if sort_by == "filename":
|
|
531
|
+
results.sort(key=lambda r: r.match.rel_path_a.lower(), reverse=reverse)
|
|
532
|
+
elif sort_by == "ratio":
|
|
533
|
+
results.sort(key=lambda r: r.ratio, reverse=reverse)
|
|
534
|
+
elif sort_by == "size":
|
|
535
|
+
def _file_size(r: DiffResult) -> int:
|
|
536
|
+
try:
|
|
537
|
+
return os.path.getsize(str(root_a / r.match.rel_path_a))
|
|
538
|
+
except OSError:
|
|
539
|
+
return 0
|
|
540
|
+
results.sort(key=_file_size, reverse=reverse)
|
|
541
|
+
logger.info(" Sorted by %s (%s)", sort_by, sort_order)
|
|
542
|
+
|
|
522
543
|
# Deep Compare (only in deep mode)
|
|
523
544
|
deep_results = None
|
|
524
545
|
if exec_mode == "deep":
|
|
@@ -1,36 +0,0 @@
|
|
|
1
|
-
"""Data / query languages: SQL, YAML, TOML."""
|
|
2
|
-
|
|
3
|
-
from diffinite.languages._spec import LangSpec
|
|
4
|
-
from diffinite.languages._registry import register
|
|
5
|
-
from diffinite.models import CommentSpec
|
|
6
|
-
|
|
7
|
-
# ── SQL ───────────────────────────────────────────────────────────
|
|
8
|
-
register(LangSpec(
|
|
9
|
-
name="SQL",
|
|
10
|
-
extensions=(".sql",),
|
|
11
|
-
comment=CommentSpec(line_markers=("--",), block_start="/*", block_end="*/"),
|
|
12
|
-
keywords=frozenset({
|
|
13
|
-
"select", "from", "where", "insert", "update", "delete",
|
|
14
|
-
"create", "drop", "alter", "table", "index", "view",
|
|
15
|
-
"join", "inner", "outer", "left", "right", "on",
|
|
16
|
-
"and", "or", "not", "in", "like", "between",
|
|
17
|
-
"order", "by", "group", "having", "limit", "offset",
|
|
18
|
-
"as", "distinct", "union", "all", "exists",
|
|
19
|
-
"null", "true", "false", "is",
|
|
20
|
-
"begin", "end", "commit", "rollback", "transaction",
|
|
21
|
-
}),
|
|
22
|
-
))
|
|
23
|
-
|
|
24
|
-
# ── YAML ──────────────────────────────────────────────────────────
|
|
25
|
-
register(LangSpec(
|
|
26
|
-
name="YAML",
|
|
27
|
-
extensions=(".yaml", ".yml"),
|
|
28
|
-
comment=CommentSpec(line_markers=("#",), block_start=None, block_end=None),
|
|
29
|
-
))
|
|
30
|
-
|
|
31
|
-
# ── TOML ──────────────────────────────────────────────────────────
|
|
32
|
-
register(LangSpec(
|
|
33
|
-
name="TOML",
|
|
34
|
-
extensions=(".toml",),
|
|
35
|
-
comment=CommentSpec(line_markers=("#",), block_start=None, block_end=None),
|
|
36
|
-
))
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|