github2gerrit 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,476 @@
+ # SPDX-License-Identifier: Apache-2.0
+ # SPDX-FileCopyrightText: 2025 The Linux Foundation
+ #
+ # PR Content Filtering Module
+ #
+ # This module provides an extensible, rule-based system for filtering and cleaning
+ # GitHub pull request content for Gerrit consumption. It supports multiple automation
+ # tools (Dependabot, pre-commit.ci, etc.) with author-specific filtering rules.
+ #
+ # Key features:
+ # - Author-specific filtering rules
+ # - Extensible rule system for different automation tools
+ # - Configurable filtering options
+ # - Content deduplication between title and body
+ # - Emoji and formatting cleanup
+
+ from __future__ import annotations
+
+ import logging
+ import re
+ from abc import ABC
+ from abc import abstractmethod
+ from dataclasses import dataclass
+ from dataclasses import field
+ from typing import Any
+
+
+ log = logging.getLogger("github2gerrit.pr_content_filter")
+
+ # Common patterns used across filters
+ _HTML_DETAILS_PATTERN = re.compile(
+     r"<details[^>]*>\s*<summary[^>]*>(.*?)</summary>\s*(.*?)</details>", re.IGNORECASE | re.DOTALL
+ )
+ _MARKDOWN_LINK_PATTERN = re.compile(r"\[([^\]]+)\]\([^)]+\)")
+ _HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
+ _MULTIPLE_NEWLINES_PATTERN = re.compile(r"\n{3,}")
+ _EMOJI_PATTERN = re.compile(r":[a-z_]+:")  # GitHub emoji codes like :sparkles:
+
+
+ @dataclass
+ class FilterConfig:
+     """Configuration for PR content filtering."""
+
+     # Global options
+     enabled: bool = True
+     remove_emoji_codes: bool = True
+     deduplicate_title_in_body: bool = True
+
+     # Author-specific filtering
+     author_rules: dict[str, str] = field(default_factory=dict)
+
+     # Rule-specific configurations
+     dependabot_config: DependabotConfig = field(default_factory=lambda: DependabotConfig())
+     precommit_config: PrecommitConfig = field(default_factory=lambda: PrecommitConfig())
+
+
+ @dataclass
+ class DependabotConfig:
+     """Configuration for Dependabot PR filtering."""
+
+     enabled: bool = True
+     expand_release_notes: bool = True
+     expand_commits: bool = True
+     remove_compatibility_images: bool = True
+     remove_command_instructions: bool = True
+     truncate_at_commands: bool = True
+
+
+ @dataclass
+ class PrecommitConfig:
+     """Configuration for pre-commit.ci PR filtering."""
+
+     enabled: bool = True
+     # Future: add pre-commit.ci specific options
+
+
+ class FilterRule(ABC):
+     """Abstract base class for PR content filtering rules."""
+
+     @abstractmethod
+     def matches(self, title: str, body: str, author: str) -> bool:
+         """Check if this rule should be applied to the given PR."""
+
+     @abstractmethod
+     def apply(self, title: str, body: str, config: Any) -> str:
+         """Apply the filtering rule and return the cleaned body."""
+
+     @abstractmethod
+     def get_config_key(self) -> str:
+         """Return the configuration key for this rule."""
+
+
+ class DependabotRule(FilterRule):
+     """Filtering rule for Dependabot PRs."""
+
+     def matches(self, title: str, body: str, author: str) -> bool:
+         """Check if this is a Dependabot PR."""
+         if not body:
+             return False
+
+         dependabot_indicators = [
+             "dependabot" in author.lower(),
+             "Bumps " in title and " from " in title and " to " in title,
+             "Dependabot will resolve any conflicts" in body,
+             "<details>" in body and "<summary>" in body,
+             "camo.githubusercontent.com" in body,
+         ]
+
+         # Require multiple indicators for confidence
+         return sum(dependabot_indicators) >= 2
+
+     def get_config_key(self) -> str:
+         return "dependabot_config"
+
+     def apply(self, title: str, body: str, config: DependabotConfig) -> str:
+         """Apply Dependabot-specific filtering."""
+         if not config.enabled:
+             return body
+
+         log.info("Applying Dependabot filtering rules")
+         filtered = body
+
+         # Step 1: Expand collapsed sections
+         if config.expand_release_notes or config.expand_commits:
+             filtered = self._expand_html_details(filtered)
+
+         # Step 2: Remove compatibility images
+         if config.remove_compatibility_images:
+             filtered = self._remove_compatibility_images(filtered)
+
+         # Step 3: Truncate at command instructions
+         if config.truncate_at_commands:
+             filtered = self._truncate_at_dependabot_commands(filtered)
+
+         return filtered
+
+     def _expand_html_details(self, content: str) -> str:
+         """Expand HTML details/summary sections."""
+
+         def replace_details(match: re.Match[str]) -> str:
+             summary = match.group(1).strip()
+             detail_content = match.group(2).strip()
+
+             if summary:
+                 return f"## {summary}\n\n{detail_content}\n"
+             return f"{detail_content}\n"
+
+         return _HTML_DETAILS_PATTERN.sub(replace_details, content)
+
+     def _remove_compatibility_images(self, content: str) -> str:
+         """Remove Dependabot compatibility score images."""
+         pattern = re.compile(r"!\[.*?\]\(https://camo\.githubusercontent\.com/[^)]*\)", re.IGNORECASE | re.DOTALL)
+         return pattern.sub("", content)
+
+     def _truncate_at_dependabot_commands(self, content: str) -> str:
+         """Truncate at Dependabot command instructions."""
+         pattern = re.compile(r"Dependabot will resolve any conflicts", re.IGNORECASE | re.MULTILINE)
+         match = pattern.search(content)
+         if match:
+             return content[: match.start()].rstrip()
+         return content
+
+
+ class PrecommitRule(FilterRule):
+     """Filtering rule for pre-commit.ci PRs."""
+
+     def matches(self, title: str, body: str, author: str) -> bool:
+         """Check if this is a pre-commit.ci PR."""
+         if not body:
+             return False
+
+         precommit_indicators = [
+             "pre-commit-ci" in author.lower(),
+             "pre-commit" in title.lower(),
+             "pre-commit.ci" in body,
+             "autoupdate" in title.lower(),
+         ]
+
+         return sum(precommit_indicators) >= 2
+
+     def get_config_key(self) -> str:
+         return "precommit_config"
+
+     def apply(self, title: str, body: str, config: PrecommitConfig) -> str:
+         """Apply pre-commit.ci specific filtering."""
+         if not config.enabled:
+             return body
+
+         log.info("Applying pre-commit.ci filtering rules")
+         # Future: implement pre-commit.ci specific filtering
+         return body
+
+
+ class PRContentFilter:
+     """Main PR content filtering engine."""
+
+     def __init__(self, config: FilterConfig | None = None):
+         """Initialize the filter with configuration."""
+         self.config = config or FilterConfig()
+         self.rules: list[FilterRule] = [
+             DependabotRule(),
+             PrecommitRule(),
+         ]
+
+     def should_filter(self, title: str, body: str, author: str) -> bool:
+         """Determine if PR content should be filtered."""
+         if not self.config.enabled or not body:
+             return False
+
+         # Check author-specific rules first
+         if author in self.config.author_rules:
+             rule_name = self.config.author_rules[author]
+             return any(rule.__class__.__name__.lower().startswith(rule_name.lower()) for rule in self.rules)
+
+         # Check if any rule matches
+         return any(rule.matches(title, body, author) for rule in self.rules)
+
+     def filter_content(self, title: str, body: str, author: str) -> str:
+         """Filter PR content based on configured rules."""
+         if not self.should_filter(title, body, author):
+             log.debug("No filtering rules matched for author: %s", author)
+             return body
+
+         filtered_body = body
+
+         # Apply global pre-processing first (including title deduplication)
+         filtered_body = self._pre_process(title, filtered_body)
+
+         # Apply matching rules
+         for rule in self.rules:
+             if rule.matches(title, body, author):
+                 config_key = rule.get_config_key()
+                 rule_config = getattr(self.config, config_key, None)
+                 if rule_config:
+                     filtered_body = rule.apply(title, filtered_body, rule_config)
+
+         # Apply global post-processing
+         filtered_body = self._post_process(title, filtered_body)
+
+         return filtered_body.strip()
+
+     def _pre_process(self, title: str, body: str) -> str:
+         """Apply global pre-processing rules."""
+         processed = body
+
+         # Remove title duplication first, before other processing
+         if self.config.deduplicate_title_in_body:
+             processed = self._remove_title_duplication(title, processed)
+
+         return processed
+
+     def _post_process(self, title: str, body: str) -> str:
+         """Apply global post-processing rules."""
+         processed = body
+
+         # Remove emoji codes
+         if self.config.remove_emoji_codes:
+             processed = self._remove_emoji_codes(processed)
+
+         # Clean HTML and markdown
+         processed = self._clean_html_and_markdown(processed)
+
+         # Clean up whitespace
+         processed = self._clean_whitespace(processed)
+
+         return processed
+
+     def _remove_emoji_codes(self, content: str) -> str:
+         """Remove GitHub emoji codes like :sparkles: and :bug:."""
+         # First handle emoji codes inside HTML tags to prevent leading spaces
+         # e.g., "<h3>:sparkles: New features</h3>" -> "<h3>New features</h3>"
+         content = re.sub(r"(<[^>]*>)\s*:[a-z_]+:\s*", r"\1", content)
+
+         # Remove emoji codes while preserving line structure
+         lines = content.splitlines()
+         cleaned_lines = []
+
+         for line in lines:
+             # Remove emoji codes from each line
+             cleaned_line = _EMOJI_PATTERN.sub("", line)
+             # Clean up multiple spaces that might result from emoji removal
+             cleaned_line = re.sub(r" +", " ", cleaned_line)
+
+             # Fix lines that started with emoji codes and now have leading space
+             if cleaned_line.startswith(" ") and not line.startswith(" "):
+                 # This line originally started with an emoji, remove the leading space
+                 cleaned_line = cleaned_line.lstrip()
+
+             # Fix markdown headers that lost their emoji but kept a stray space,
+             # e.g., "### :sparkles: New features" -> "### New features", not "###  New features"
+             if cleaned_line.startswith("### "):
+                 # Ensure exactly one space after ###
+                 header_text = cleaned_line[4:].lstrip()  # Remove ### and any spaces
+                 cleaned_line = f"### {header_text}" if header_text else "###"
+             elif cleaned_line.startswith("## "):
+                 # Same for ## headers
+                 header_text = cleaned_line[3:].lstrip()
+                 cleaned_line = f"## {header_text}" if header_text else "##"
+             elif cleaned_line.startswith("# "):
+                 # Same for # headers
+                 header_text = cleaned_line[2:].lstrip()
+                 cleaned_line = f"# {header_text}" if header_text else "#"
+
+             # Strip trailing whitespace but preserve the line
+             cleaned_lines.append(cleaned_line.rstrip())
+
+         # Post-process to add missing line breaks after non-markdown headings
+         final_lines = []
+         for i, line in enumerate(cleaned_lines):
+             final_lines.append(line)
+
+             # If this line looks like a heading (not starting with #) and should have
+             # a blank line after it, add one
+             if (
+                 i < len(cleaned_lines) - 1
+                 and line.strip()
+                 and not line.startswith("#")
+                 and not line.startswith("-")
+                 and not line.startswith(" ")
+                 and not line.startswith("@")
+                 and cleaned_lines[i + 1].strip()
+                 and
+                 # Check if this looks like a heading (common patterns)
+                 (
+                     line.endswith(("Changed", "Contributors"))
+                     or "features" in line.lower()
+                     or "fixes" in line.lower()
+                     or "upgrades" in line.lower()
+                     or "documentation" in line.lower()
+                 )
+             ):
+                 # Add blank line regardless of what follows
+                 final_lines.append("")
+
+         return "\n".join(final_lines)
+
+     def _clean_html_and_markdown(self, content: str) -> str:
+         """Clean HTML tags and simplify markdown links."""
+         # Remove HTML tags
+         cleaned = _HTML_TAG_PATTERN.sub("", content)
+
+         # Simplify markdown links to just the text
+         cleaned = _MARKDOWN_LINK_PATTERN.sub(r"\1", cleaned)
+
+         return cleaned
+
+     def _remove_title_duplication(self, title: str, body: str) -> str:
+         """Remove duplication of title in body content."""
+         if not title or not body:
+             return body
+
+         lines = body.splitlines()
+         if not lines:
+             return body
+
+         # Find the first non-empty line
+         first_content_line_idx = None
+         first_content_line = ""
+
+         for i, line in enumerate(lines):
+             if line.strip():
+                 first_content_line_idx = i
+                 first_content_line = line.strip()
+                 break
+
+         if first_content_line_idx is None:
+             return body  # No content found
+
+         # Clean both title and first content line for comparison
+         title_clean = self._clean_for_comparison(title)
+         first_line_clean = self._clean_for_comparison(first_content_line)
+
+         # Handle common variations:
+         # - Exact match after cleaning
+         # - Body starts with "Bumps ..." when title is "Bump ..."
+         is_duplicate = first_line_clean == title_clean or (
+             title_clean.startswith("Bump ")
+             and first_line_clean.startswith("Bumps ")
+             and title_clean[5:] == first_line_clean[6:]
+         )
+
+         if is_duplicate:
+             # Remove the duplicate line and any immediately following empty lines
+             remaining_lines = lines[first_content_line_idx + 1 :]
+             while remaining_lines and not remaining_lines[0].strip():
+                 remaining_lines = remaining_lines[1:]
+             return "\n".join(remaining_lines)
+
+         return body
+
+     def _clean_for_comparison(self, text: str) -> str:
+         """Clean text for comparison by removing markdown and punctuation."""
+         # Remove markdown links
+         cleaned = _MARKDOWN_LINK_PATTERN.sub(r"\1", text)
+         # Remove trailing periods and normalize spacing
+         cleaned = cleaned.strip().rstrip(".")
+         return cleaned
+
+     def _clean_whitespace(self, content: str) -> str:
+         """Clean up excessive whitespace."""
+         # Strip trailing whitespace from each line
+         lines = [line.rstrip() for line in content.splitlines()]
+         cleaned = "\n".join(lines)
+
+         # Reduce multiple newlines to at most 2
+         cleaned = _MULTIPLE_NEWLINES_PATTERN.sub("\n\n", cleaned)
+
+         return cleaned
+
+     def add_rule(self, rule: FilterRule) -> None:
+         """Add a custom filtering rule."""
+         self.rules.append(rule)
+
+     def set_author_rule(self, author: str, rule_name: str) -> None:
+         """Set a specific rule for an author."""
+         self.config.author_rules[author] = rule_name
+
+
+ # Convenience functions for backward compatibility and simple usage
+ def create_default_filter() -> PRContentFilter:
+     """Create a filter with default configuration."""
+     config = FilterConfig()
+
+     # Set up default author mappings
+     config.author_rules.update(
+         {
+             "dependabot[bot]": "dependabot",
+             "dependabot": "dependabot",
+             "pre-commit-ci[bot]": "precommit",
+             "pre-commit-ci": "precommit",
+         }
+     )
+
+     return PRContentFilter(config)
+
+
+ def filter_pr_body(title: str, body: str | None, author: str | None = None) -> str:
+     """
+     Main entry point for PR body filtering with default configuration.
+
+     Args:
+         title: PR title
+         body: PR body
+         author: PR author
+
+     Returns:
+         Filtered body, or original body if no filtering needed
+     """
+     if not body:
+         return body or ""
+
+     filter_engine = create_default_filter()
+     return filter_engine.filter_content(title, body, author or "")
+
+
+ # Legacy compatibility functions
+ def should_filter_pr_body(title: str, body: str | None, author: str | None = None) -> bool:
+     """Legacy function for checking if filtering should be applied."""
+     if not body:
+         return False
+
+     filter_engine = create_default_filter()
+     return filter_engine.should_filter(title, body, author or "")
+
+
+ def filter_dependabot_pr_body(body: str | None) -> str:
+     """Legacy function for Dependabot-specific filtering."""
+     if not body:
+         return ""
+
+     # Create a Dependabot-only filter
+     config = FilterConfig()
+
+     # Force apply Dependabot rule
+     rule = DependabotRule()
+     return rule.apply("", str(body), config.dependabot_config)
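
For orientation, here is a minimal usage sketch of the module's public entry points. It assumes the file is importable as `github2gerrit.pr_content_filter` (consistent with the logger name in the file); the PR title and body below are invented Dependabot-style samples, not real package data.

```python
# Illustrative only: the title/body are made-up Dependabot-style content.
from github2gerrit.pr_content_filter import filter_pr_body, should_filter_pr_body

title = "Bump requests from 2.31.0 to 2.32.0"
body = (
    "Bumps [requests](https://github.com/psf/requests) from 2.31.0 to 2.32.0.\n"
    "<details>\n<summary>Release notes</summary>\n\n"
    "### :sparkles: New features\n- Faster connection pooling\n"
    "</details>\n\n"
    "Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself.\n"
)

# should_filter_pr_body() reports whether any rule (or author mapping) applies;
# filter_pr_body() runs title deduplication, the Dependabot rule, and global cleanup.
if should_filter_pr_body(title, body, author="dependabot[bot]"):
    print(filter_pr_body(title, body, author="dependabot[bot]"))
```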
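The dataclass-based configuration can also be composed directly instead of using `create_default_filter()`. The sketch below is illustrative: the chosen option values and the `renovate[bot]` author mapping are invented for the example. Note that `set_author_rule()` only makes `should_filter()` return True for that author; each rule's own `matches()` heuristics still decide whether its body rewriting runs, so this sample only exercises the global deduplication and cleanup steps.

```python
# Illustrative only: option values and the extra author mapping are assumptions.
from github2gerrit.pr_content_filter import DependabotConfig, FilterConfig, PRContentFilter

config = FilterConfig(
    remove_emoji_codes=False,  # keep :sparkles:-style codes in the cleaned body
    dependabot_config=DependabotConfig(remove_compatibility_images=False),
)
engine = PRContentFilter(config)

# Route an additional bot account to the existing "dependabot" rule name.
engine.set_author_rule("renovate[bot]", "dependabot")

title = "Bump urllib3 from 2.2.1 to 2.2.2"
body = "Bumps urllib3 from 2.2.1 to 2.2.2.\n\nSee the changelog for details."
print(engine.filter_content(title, body, author="renovate[bot]"))
```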