diffinite 0.8.0__tar.gz → 0.9.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. {diffinite-0.8.0/src/diffinite.egg-info → diffinite-0.9.1}/PKG-INFO +1 -1
  2. {diffinite-0.8.0 → diffinite-0.9.1}/pyproject.toml +1 -1
  3. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/cli.py +36 -0
  4. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/differ.py +32 -10
  5. diffinite-0.9.1/src/diffinite/languages/data.py +64 -0
  6. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/pipeline.py +25 -4
  7. {diffinite-0.8.0 → diffinite-0.9.1/src/diffinite.egg-info}/PKG-INFO +1 -1
  8. diffinite-0.8.0/src/diffinite/languages/data.py +0 -36
  9. {diffinite-0.8.0 → diffinite-0.9.1}/LICENSE +0 -0
  10. {diffinite-0.8.0 → diffinite-0.9.1}/NOTICE +0 -0
  11. {diffinite-0.8.0 → diffinite-0.9.1}/README.md +0 -0
  12. {diffinite-0.8.0 → diffinite-0.9.1}/setup.cfg +0 -0
  13. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/__init__.py +0 -0
  14. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/__main__.py +0 -0
  15. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/collector.py +0 -0
  16. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/deep_compare.py +0 -0
  17. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/evidence.py +0 -0
  18. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/fingerprint.py +0 -0
  19. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/languages/__init__.py +0 -0
  20. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/languages/_registry.py +0 -0
  21. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/languages/_spec.py +0 -0
  22. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/languages/c_family.py +0 -0
  23. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/languages/csharp.py +0 -0
  24. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/languages/go_rust_swift.py +0 -0
  25. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/languages/java.py +0 -0
  26. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/languages/javascript.py +0 -0
  27. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/languages/markup.py +0 -0
  28. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/languages/python.py +0 -0
  29. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/languages/scripting.py +0 -0
  30. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/models.py +0 -0
  31. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/parser.py +0 -0
  32. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite/pdf_gen.py +0 -0
  33. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite.egg-info/SOURCES.txt +0 -0
  34. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite.egg-info/dependency_links.txt +0 -0
  35. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite.egg-info/entry_points.txt +0 -0
  36. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite.egg-info/requires.txt +0 -0
  37. {diffinite-0.8.0 → diffinite-0.9.1}/src/diffinite.egg-info/top_level.txt +0 -0
  38. {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_cli.py +0 -0
  39. {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_collector.py +0 -0
  40. {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_deep_compare.py +0 -0
  41. {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_differ.py +0 -0
  42. {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_differ_extended.py +0 -0
  43. {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_evidence.py +0 -0
  44. {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_evidence_hash.py +0 -0
  45. {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_fingerprint.py +0 -0
  46. {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_languages.py +0 -0
  47. {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_normalize.py +0 -0
  48. {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_parser.py +0 -0
  49. {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_pdf_gen.py +0 -0
  50. {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_pipeline.py +0 -0
  51. {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_plagiarism_dataset.py +0 -0
  52. {diffinite-0.8.0 → diffinite-0.9.1}/tests/test_sqlite_integration.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: diffinite
3
- Version: 0.8.0
3
+ Version: 0.9.1
4
4
  Summary: Forensic source-code comparison tool — Winnowing fingerprints and professional PDF reports for IP litigation & code audit
5
5
  Author: nash-dir
6
6
  License: Apache-2.0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "diffinite"
7
- version = "0.8.0"
7
+ version = "0.9.1"
8
8
  description = "Forensic source-code comparison tool — Winnowing fingerprints and professional PDF reports for IP litigation & code audit"
9
9
  readme = "README.md"
10
10
  license = {text = "Apache-2.0"}
@@ -95,6 +95,34 @@ def main(argv: list[str] | None = None) -> None:
95
95
  default=FUZZY_THRESHOLD,
96
96
  help=f"Fuzzy matching threshold 0–100 (default: {FUZZY_THRESHOLD})",
97
97
  )
98
+ parser.add_argument(
99
+ "--encoding",
100
+ default="auto",
101
+ help=(
102
+ "Source file encoding. 'auto' (default) uses charset-normalizer "
103
+ "auto-detection with Korean-optimized fallback (utf-8 -> euc-kr -> cp949). "
104
+ "Specify an explicit encoding (e.g. euc-kr, utf-8, cp949, shift_jis, "
105
+ "gb2312) to force-decode all files with that encoding."
106
+ ),
107
+ )
108
+ parser.add_argument(
109
+ "--sort-by",
110
+ choices=["filename", "size", "ratio"],
111
+ default=None,
112
+ dest="sort_by",
113
+ help=(
114
+ "Sort matched file pairs in the report. "
115
+ "'filename' sorts by file path, 'size' by file size, "
116
+ "'ratio' by similarity ratio. Default: insertion order (no sort)."
117
+ ),
118
+ )
119
+ parser.add_argument(
120
+ "--sort-order",
121
+ choices=["asc", "desc"],
122
+ default="asc",
123
+ dest="sort_order",
124
+ help="Sort direction (default: asc). Only effective with --sort-by.",
125
+ )
98
126
 
99
127
  # ── Output modes ──────────────────────────────────────────────────
100
128
  parser.add_argument(
@@ -280,6 +308,9 @@ def main(argv: list[str] | None = None) -> None:
280
308
  autojunk=not args.no_autojunk,
281
309
  )
282
310
 
311
+ # Resolve encoding
312
+ encoding = args.encoding if args.encoding.lower() != "auto" else None
313
+
283
314
  run_pipeline(
284
315
  dir_a=args.dir_a,
285
316
  dir_b=args.dir_b,
@@ -313,6 +344,11 @@ def main(argv: list[str] | None = None) -> None:
313
344
  report_html=args.report_html,
314
345
  report_md=args.report_md,
315
346
  report_json=args.report_json,
347
+ # Encoding
348
+ encoding=encoding,
349
+ # Sorting
350
+ sort_by=args.sort_by,
351
+ sort_order=args.sort_order,
316
352
  )
317
353
 
318
354
 
@@ -38,12 +38,17 @@ logger = logging.getLogger(__name__)
38
38
  # ──────────────────────────────────────────────────────────────────────
39
39
  # 인코딩 자동 감지 파일 리더
40
40
  # ──────────────────────────────────────────────────────────────────────
41
- def read_file(path: str) -> Optional[str]:
41
+ def read_file(path: str, encoding: str | None = None) -> Optional[str]:
42
42
  """파일을 읽고 인코딩을 자동 감지하여 유니코드 문자열로 반환한다.
43
43
 
44
44
  ``charset_normalizer.from_bytes()``는 BOM, 통계적 분석 등을 조합하여
45
45
  UTF-8, EUC-KR, Shift_JIS 등 다양한 인코딩을 감지한다.
46
46
 
47
+ Args:
48
+ path: 파일 경로.
49
+ encoding: 인코딩 지정. ``None`` 또는 ``"auto"``이면 자동 감지.
50
+ ``"euc-kr"``, ``"utf-8"`` 등 지정하면 해당 인코딩으로 강제 디코딩.
51
+
47
52
  Returns:
48
53
  디코딩된 텍스트. 빈 파일은 ``""``, 감지 실패 시 ``None``.
49
54
 
@@ -60,16 +65,33 @@ def read_file(path: str) -> Optional[str]:
60
65
  if not raw:
61
66
  return ""
62
67
 
63
- result = from_bytes(raw).best()
64
- if result is None:
65
- logger.warning("Could not detect encoding for %s — skipping", path)
66
- return None
68
+ # ── Manual encoding specified ────────────────────────────────
69
+ if encoding and encoding.lower() not in ("auto", ""):
70
+ try:
71
+ return raw.decode(encoding)
72
+ except (UnicodeDecodeError, LookupError) as exc:
73
+ logger.error("Decoding failed for %s with encoding %s: %s",
74
+ path, encoding, exc)
75
+ return None
67
76
 
68
- try:
69
- return str(result)
70
- except Exception as exc: # noqa: BLE001
71
- logger.error("Decoding failed for %s (%s): %s", path, result.encoding, exc)
72
- return None
77
+ # ── Auto-detect with charset_normalizer ──────────────────────
78
+ result = from_bytes(raw).best()
79
+ if result is not None:
80
+ try:
81
+ return str(result)
82
+ except Exception as exc: # noqa: BLE001
83
+ logger.warning("charset_normalizer decode failed for %s (%s): %s",
84
+ path, result.encoding, exc)
85
+
86
+ # ── Fallback chain (Korean-optimized) ────────────────────────
87
+ for fallback_enc in ("utf-8", "euc-kr", "cp949"):
88
+ try:
89
+ return raw.decode(fallback_enc)
90
+ except (UnicodeDecodeError, LookupError):
91
+ continue
92
+
93
+ logger.warning("Could not detect encoding for %s — skipping", path)
94
+ return None
73
95
 
74
96
 
75
97
  # ──────────────────────────────────────────────────────────────────────
@@ -0,0 +1,64 @@
1
+ """Data / query languages: SQL, YAML, TOML."""
2
+
3
+ from diffinite.languages._spec import LangSpec
4
+ from diffinite.languages._registry import register
5
+ from diffinite.models import CommentSpec
6
+
7
+ # ── SQL ───────────────────────────────────────────────────────────
8
+ register(LangSpec(
9
+ name="SQL",
10
+ extensions=(".sql", ".ddl", ".dml", ".pks", ".pkb", ".plsql", ".tsql"),
11
+ comment=CommentSpec(line_markers=("--",), block_start="/*", block_end="*/"),
12
+ keywords=frozenset({
13
+ # DML
14
+ "select", "from", "where", "insert", "update", "delete", "merge",
15
+ "into", "values", "set", "returning",
16
+ # DDL
17
+ "create", "drop", "alter", "table", "index", "view", "schema",
18
+ "database", "sequence", "trigger", "procedure", "function",
19
+ "column", "constraint", "primary", "foreign", "key", "references",
20
+ "unique", "check", "default", "auto_increment", "identity",
21
+ "cascade", "restrict", "truncate", "rename", "replace",
22
+ # Joins
23
+ "join", "inner", "outer", "left", "right", "cross", "full",
24
+ "natural", "on", "using",
25
+ # Operators & conditions
26
+ "and", "or", "not", "in", "like", "ilike", "between",
27
+ "is", "null", "true", "false", "case", "when", "then", "else",
28
+ # Aggregates & ordering
29
+ "order", "by", "group", "having", "limit", "offset", "fetch",
30
+ "as", "distinct", "union", "intersect", "except", "all", "exists",
31
+ "count", "sum", "avg", "min", "max",
32
+ # Window functions
33
+ "over", "partition", "row_number", "rank", "dense_rank",
34
+ "lead", "lag", "first_value", "last_value", "ntile",
35
+ "rows", "range", "unbounded", "preceding", "following", "current",
36
+ # Transactions & control flow
37
+ "begin", "end", "commit", "rollback", "savepoint", "transaction",
38
+ # DCL
39
+ "grant", "revoke", "deny",
40
+ # Subquery & CTE
41
+ "with", "recursive",
42
+ # Types
43
+ "int", "integer", "varchar", "char", "text", "boolean", "date",
44
+ "timestamp", "decimal", "numeric", "float", "double", "blob", "clob",
45
+ # Procedural (PL/SQL, T-SQL)
46
+ "declare", "variable", "cursor", "open", "close", "fetch",
47
+ "loop", "while", "for", "if", "elseif", "elsif", "return",
48
+ "exec", "execute", "call", "raise", "exception", "handler",
49
+ }),
50
+ ))
51
+
52
+ # ── YAML ──────────────────────────────────────────────────────────
53
+ register(LangSpec(
54
+ name="YAML",
55
+ extensions=(".yaml", ".yml"),
56
+ comment=CommentSpec(line_markers=("#",), block_start=None, block_end=None),
57
+ ))
58
+
59
+ # ── TOML ──────────────────────────────────────────────────────────
60
+ register(LangSpec(
61
+ name="TOML",
62
+ extensions=(".toml",),
63
+ comment=CommentSpec(line_markers=("#",), block_start=None, block_end=None),
64
+ ))
@@ -377,6 +377,11 @@ def run_pipeline(
377
377
  # Evidence integrity
378
378
  embed_hash: bool = False,
379
379
  bundle_path: str | None = None,
380
+ # Encoding
381
+ encoding: str | None = None,
382
+ # Sorting
383
+ sort_by: str | None = None,
384
+ sort_order: str = "asc",
380
385
  ) -> None:
381
386
  """Execute the full diff-to-report pipeline.
382
387
 
@@ -448,8 +453,8 @@ def run_pipeline(
448
453
  abs_b = str(root_b / m.rel_path_b)
449
454
  ext = Path(m.rel_path_a).suffix.lower()
450
455
 
451
- text_a = read_file(abs_a)
452
- text_b = read_file(abs_b)
456
+ text_a = read_file(abs_a, encoding=encoding)
457
+ text_b = read_file(abs_b, encoding=encoding)
453
458
 
454
459
  if text_a is None or text_b is None:
455
460
  results.append(DiffResult(
@@ -491,8 +496,8 @@ def run_pipeline(
491
496
  abs_b = str(root_b / m.rel_path_b)
492
497
  ext = Path(m.rel_path_a).suffix.lower()
493
498
 
494
- text_a = read_file(abs_a)
495
- text_b = read_file(abs_b)
499
+ text_a = read_file(abs_a, encoding=encoding)
500
+ text_b = read_file(abs_b, encoding=encoding)
496
501
  if text_a is None or text_b is None:
497
502
  continue
498
503
  if not compare_comment:
@@ -519,6 +524,22 @@ def run_pipeline(
519
524
 
520
525
  total_files = len(results)
521
526
 
527
+ # ── Sort results ──────────────────────────────────────────────
528
+ if sort_by:
529
+ reverse = sort_order == "desc"
530
+ if sort_by == "filename":
531
+ results.sort(key=lambda r: r.match.rel_path_a.lower(), reverse=reverse)
532
+ elif sort_by == "ratio":
533
+ results.sort(key=lambda r: r.ratio, reverse=reverse)
534
+ elif sort_by == "size":
535
+ def _file_size(r: DiffResult) -> int:
536
+ try:
537
+ return os.path.getsize(str(root_a / r.match.rel_path_a))
538
+ except OSError:
539
+ return 0
540
+ results.sort(key=_file_size, reverse=reverse)
541
+ logger.info(" Sorted by %s (%s)", sort_by, sort_order)
542
+
522
543
  # Deep Compare (only in deep mode)
523
544
  deep_results = None
524
545
  if exec_mode == "deep":
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: diffinite
3
- Version: 0.8.0
3
+ Version: 0.9.1
4
4
  Summary: Forensic source-code comparison tool — Winnowing fingerprints and professional PDF reports for IP litigation & code audit
5
5
  Author: nash-dir
6
6
  License: Apache-2.0
@@ -1,36 +0,0 @@
1
- """Data / query languages: SQL, YAML, TOML."""
2
-
3
- from diffinite.languages._spec import LangSpec
4
- from diffinite.languages._registry import register
5
- from diffinite.models import CommentSpec
6
-
7
- # ── SQL ───────────────────────────────────────────────────────────
8
- register(LangSpec(
9
- name="SQL",
10
- extensions=(".sql",),
11
- comment=CommentSpec(line_markers=("--",), block_start="/*", block_end="*/"),
12
- keywords=frozenset({
13
- "select", "from", "where", "insert", "update", "delete",
14
- "create", "drop", "alter", "table", "index", "view",
15
- "join", "inner", "outer", "left", "right", "on",
16
- "and", "or", "not", "in", "like", "between",
17
- "order", "by", "group", "having", "limit", "offset",
18
- "as", "distinct", "union", "all", "exists",
19
- "null", "true", "false", "is",
20
- "begin", "end", "commit", "rollback", "transaction",
21
- }),
22
- ))
23
-
24
- # ── YAML ──────────────────────────────────────────────────────────
25
- register(LangSpec(
26
- name="YAML",
27
- extensions=(".yaml", ".yml"),
28
- comment=CommentSpec(line_markers=("#",), block_start=None, block_end=None),
29
- ))
30
-
31
- # ── TOML ──────────────────────────────────────────────────────────
32
- register(LangSpec(
33
- name="TOML",
34
- extensions=(".toml",),
35
- comment=CommentSpec(line_markers=("#",), block_start=None, block_end=None),
36
- ))
File without changes
File without changes
File without changes
File without changes
File without changes