archsight 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +26 -5
  3. data/lib/archsight/analysis/executor.rb +112 -0
  4. data/lib/archsight/analysis/result.rb +174 -0
  5. data/lib/archsight/analysis/sandbox.rb +319 -0
  6. data/lib/archsight/analysis.rb +11 -0
  7. data/lib/archsight/annotations/architecture_annotations.rb +2 -2
  8. data/lib/archsight/cli.rb +163 -0
  9. data/lib/archsight/database.rb +6 -2
  10. data/lib/archsight/helpers/analysis_renderer.rb +83 -0
  11. data/lib/archsight/helpers/formatting.rb +95 -0
  12. data/lib/archsight/helpers.rb +20 -4
  13. data/lib/archsight/import/concurrent_progress.rb +341 -0
  14. data/lib/archsight/import/executor.rb +466 -0
  15. data/lib/archsight/import/git_analytics.rb +626 -0
  16. data/lib/archsight/import/handler.rb +263 -0
  17. data/lib/archsight/import/handlers/github.rb +161 -0
  18. data/lib/archsight/import/handlers/gitlab.rb +202 -0
  19. data/lib/archsight/import/handlers/jira_base.rb +189 -0
  20. data/lib/archsight/import/handlers/jira_discover.rb +161 -0
  21. data/lib/archsight/import/handlers/jira_metrics.rb +179 -0
  22. data/lib/archsight/import/handlers/openapi_schema_parser.rb +279 -0
  23. data/lib/archsight/import/handlers/repository.rb +439 -0
  24. data/lib/archsight/import/handlers/rest_api.rb +293 -0
  25. data/lib/archsight/import/handlers/rest_api_index.rb +183 -0
  26. data/lib/archsight/import/progress.rb +91 -0
  27. data/lib/archsight/import/registry.rb +54 -0
  28. data/lib/archsight/import/shared_file_writer.rb +67 -0
  29. data/lib/archsight/import/team_matcher.rb +195 -0
  30. data/lib/archsight/import.rb +14 -0
  31. data/lib/archsight/resources/analysis.rb +91 -0
  32. data/lib/archsight/resources/application_component.rb +2 -2
  33. data/lib/archsight/resources/application_service.rb +12 -12
  34. data/lib/archsight/resources/business_product.rb +12 -12
  35. data/lib/archsight/resources/data_object.rb +1 -1
  36. data/lib/archsight/resources/import.rb +79 -0
  37. data/lib/archsight/resources/technology_artifact.rb +23 -2
  38. data/lib/archsight/version.rb +1 -1
  39. data/lib/archsight/web/api/docs.rb +17 -0
  40. data/lib/archsight/web/api/json_helpers.rb +164 -0
  41. data/lib/archsight/web/api/openapi/spec.yaml +500 -0
  42. data/lib/archsight/web/api/routes.rb +101 -0
  43. data/lib/archsight/web/application.rb +66 -43
  44. data/lib/archsight/web/doc/import.md +458 -0
  45. data/lib/archsight/web/doc/index.md.erb +1 -0
  46. data/lib/archsight/web/public/css/artifact.css +10 -0
  47. data/lib/archsight/web/public/css/graph.css +14 -0
  48. data/lib/archsight/web/public/css/instance.css +489 -0
  49. data/lib/archsight/web/views/api_docs.erb +19 -0
  50. data/lib/archsight/web/views/partials/artifact/_project_estimate.haml +14 -8
  51. data/lib/archsight/web/views/partials/instance/_analysis_detail.haml +74 -0
  52. data/lib/archsight/web/views/partials/instance/_analysis_result.haml +64 -0
  53. data/lib/archsight/web/views/partials/instance/_detail.haml +7 -3
  54. data/lib/archsight/web/views/partials/instance/_import_detail.haml +87 -0
  55. data/lib/archsight/web/views/partials/instance/_relations.haml +4 -4
  56. data/lib/archsight/web/views/partials/layout/_content.haml +4 -0
  57. data/lib/archsight/web/views/partials/layout/_navigation.haml +6 -5
  58. metadata +78 -1
@@ -0,0 +1,626 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "time"
4
+ require "open3"
5
+ require "archsight/import"
6
+
7
+ # Repository health metrics analyzer (human activity only)
8
+ #
9
+ # Analyzes git repositories to extract:
10
+ # - Commits, contributors, top contributors (full history for team matching)
11
+ # - Recent tags (last 2 years)
12
+ # - Bus factor risk (low / medium / high / unknown)
13
+ # - Activity status (active / bot-only / abandoned)
14
+ # - Deployment types, workflow platforms, OCI images
15
+ # - Agentic tools configuration
16
+ # - README description and documentation links
17
+ #
18
+ # @example
19
+ # analytics = Archsight::Import::GitAnalytics.new("/path/to/repo")
20
+ # result = analytics.analyze
21
+ class Archsight::Import::GitAnalytics
22
# Default length, in days, of the "recent activity" window (see @since_iso).
DEFAULT_SINCE_DAYS = 180

# Default commit-share thresholds used by bus_factor_risk: a top author
# holding more than HIGH is "high" risk, more than MED is "medium".
DEFAULT_HIGH_THRESH = 0.75
DEFAULT_MED_THRESH = 0.50

# Author-name patterns that mark a commit as automated rather than human.
IGNORED_BOTS = [/dependabot/i, /renovate\[bot\]/i, /greenkeeper/i, /ci\s+bot/i].freeze

# Tool name => repository-relative paths; the tool is reported by
# agentic_tools when any one of its paths exists.
AGENTIC_FILES = {
  "claude" => ["claude.md", ".claude.md", "docs/claude.md", "CLAUDE.md"],
  "cursor" => [".cursorrules", ".cursor/rules", "cursor.md"],
  "aider" => [".aider.conf.yml", "aider.md", "docs/aider.md"],
  "github-copilot" => [".github/copilot-instructions.md"],
  "agents" => ["agents.md", ".agents.md", "docs/agents.md"]
}.freeze
40
+
41
# Set up an analyzer for one repository checkout.
#
# @param repo_path [String] directory passed to `git -C` and used as the
#   root for all file-existence probes
# @param options [Hash] optional overrides keyed by :since_days,
#   :high_thresh and :med_thresh; missing or nil values fall back to the
#   DEFAULT_* constants
def initialize(repo_path, options = {})
  @repo_path = repo_path

  @since_days, @high_thresh, @med_thresh = [
    [:since_days, DEFAULT_SINCE_DAYS],
    [:high_thresh, DEFAULT_HIGH_THRESH],
    [:med_thresh, DEFAULT_MED_THRESH]
  ].map { |key, fallback| options[key] || fallback }

  # Start of the recent-activity window as a UTC ISO-8601 timestamp.
  window_seconds = @since_days * 86_400
  @since_iso = (Time.now - window_seconds).utc.iso8601
end
48
+
49
# Collect every metric into one string-keyed Hash.
#
# @return [Hash{String => Object}] one entry per metric, in the order below
def analyze
  report = {}
  report["commits"] = commit_count
  report["commits_per_month"] = commits_per_month
  report["contributors"] = contributor_count
  report["contributors_6m"] = contributors_6m_unique
  report["contributors_per_month"] = contributors_per_month
  report["top_contributors"] = top_contributors
  report["recent_tags"] = recent_tags
  report["activity_status"] = activity_status
  report["created_at"] = created_at
  report["last_human_commit"] = last_human_commit
  report["bus_factor_risk"] = bus_factor_risk
  report["agentic_tools"] = agentic_tools
  report["deployment_types"] = deployment_types
  report["workflow_platforms"] = workflow_platforms
  report["workflow_types"] = workflow_types
  report["oci_images"] = oci_images
  report["description"] = description
  report["documentation_links"] = documentation_links
  report
end
71
+
72
+ private
73
+
74
# Run `git -C <repo_path> <git_args...>` and return its standard output.
#
# The output is tagged as UTF-8, invalid byte sequences are replaced with
# "?", and surrounding whitespace is stripped.
#
# @param git_args [Array<String>] arguments placed after `git -C <repo_path>`
# @return [String] cleaned stdout
# @raise [RuntimeError] when git exits unsuccessfully; the message carries
#   the command line and git's stderr
def git(*git_args)
  argv = ["git", "-C", @repo_path, *git_args]
  stdout, stderr, exit_status = Open3.capture3(*argv)

  unless exit_status.success?
    raise "git failed: #{argv.join(" ")}\n#{stderr}"
  end

  text = stdout.force_encoding("UTF-8")
  text.encode("UTF-8", invalid: :replace, undef: :replace, replace: "?").strip
end
82
+
83
# Tell whether an author string matches any pattern in IGNORED_BOTS.
#
# @param author_str [String] author name taken from the git log
# @return [Boolean]
def bot?(author_str)
  IGNORED_BOTS.any? do |pattern|
    pattern.match?(author_str)
  end
end
87
+
88
# Memoized name of the ref that all log queries run against.
#
# @return [String]
def most_recent_ref
  @most_recent_ref || (@most_recent_ref = find_most_recent_ref)
end

# Pick the ref to analyze.
#
# Preference order: the first non-empty name listed by `git for-each-ref`
# over refs/heads/ and refs/remotes/ (sorted by committer date, newest
# first); otherwise "main" or "master" if `git rev-parse --verify`
# succeeds for it; otherwise "HEAD". Any error while listing refs also
# yields "HEAD".
#
# @return [String]
def find_most_recent_ref
  listing = git(
    "for-each-ref",
    "--sort=-committerdate",
    "--format=%(refname:short)",
    "refs/heads/",
    "refs/remotes/"
  )

  newest = listing.split("\n").find { |name| !name.empty? }
  return newest if newest

  # A failing rev-parse just means this candidate does not exist.
  fallback = %w[main master].find do |candidate|
    git("rev-parse", "--verify", candidate)
  rescue StandardError
    false
  end

  fallback || "HEAD"
rescue StandardError
  "HEAD"
end
115
+
116
# Every non-merge commit reachable from most_recent_ref, bots included.
# Memoized.
#
# @return [Array<Array<String>>] one [hash, author name, author email]
#   triple per commit, in `git log` order
def raw_commit_lines
  return @raw_commit_lines if @raw_commit_lines

  log = git("log", most_recent_ref, "--no-merges", "--pretty=format:%H|%an|%ae", "--")
  @raw_commit_lines = log.split("\n").map { |entry| entry.split("|", 3) }
end

# Same shape as raw_commit_lines, restricted to commits since @since_iso.
# Memoized.
#
# @return [Array<Array<String>>]
def recent_commit_lines
  return @recent_commit_lines if @recent_commit_lines

  log = git(
    "log", most_recent_ref, "--since=#{@since_iso}", "--no-merges", "--pretty=format:%H|%an|%ae", "--"
  )
  @recent_commit_lines = log.split("\n").map { |entry| entry.split("|", 3) }
end
138
+
139
# raw_commit_lines minus commits whose author name matches bot?. Memoized.
#
# @return [Array<Array<String>>]
def human_commits
  return @human_commits if @human_commits

  @human_commits = raw_commit_lines.select { |_hash, author, _email| !bot?(author) }
end

# recent_commit_lines minus commits whose author name matches bot?.
# Memoized; consulted by activity_status.
#
# @return [Array<Array<String>>]
def recent_human_commits
  return @recent_human_commits if @recent_human_commits

  @recent_human_commits = recent_commit_lines.select { |_hash, author, _email| !bot?(author) }
end
148
+
149
# Non-merge commits on most_recent_ref since a date 6 * 30 days ago,
# bots included. The cutoff is handed to git as a plain YYYY-MM-DD date.
# Memoized; feeds the bus factor calculation.
#
# @return [Array<Array<String>>] [hash, author name, author email] triples
def commits_6m
  return @commits_6m if @commits_6m

  window_seconds = 6 * 30 * 24 * 60 * 60
  cutoff_day = (Time.now - window_seconds).strftime("%Y-%m-%d")
  log = git("log", most_recent_ref, "--since=#{cutoff_day}", "--no-merges", "--pretty=format:%H|%an|%ae", "--")

  @commits_6m = log.split("\n").map { |entry| entry.split("|", 3) }
end

# commits_6m minus commits whose author name matches bot?. Memoized.
#
# @return [Array<Array<String>>]
def human_commits_6m
  return @human_commits_6m if @human_commits_6m

  @human_commits_6m = commits_6m.select { |_hash, author, _email| !bot?(author) }
end
168
+
169
# Number of human (non-bot) commits over the full history.
#
# @return [Integer]
def commit_count
  human_commits.length
end
172
+
173
# Committer date (`%cI`) of the first entry in human_commits, i.e. the
# first human commit listed by `git log`.
#
# @return [String, nil] the date as printed by git, or nil when there are
#   no human commits
def last_human_commit
  newest = human_commits.first
  return nil unless newest

  git("show", "-s", "--format=%cI", newest.first)
end
179
+
180
# Committer date (`%cI`) of the last entry in raw_commit_lines, i.e. the
# final commit listed by `git log` (bots included).
#
# @return [String, nil] the date as printed by git; nil when there are no
#   commits or when any step raises
def created_at
  oldest = raw_commit_lines.last
  return nil unless oldest

  git("show", "-s", "--format=%cI", oldest.first)
rescue StandardError
  nil
end
188
+
189
# Commit counts per calendar month, from the month of the earliest parsed
# commit date through the current month.
#
# Counts come from a `git log` of committer dates on most_recent_ref with
# merges excluded; no bot filtering is applied here. Unparseable date
# lines are skipped.
#
# @return [Array<Integer>] one count per month, oldest first; [] when
#   there is nothing to count
def commits_per_month
  return [] if raw_commit_lines.empty?

  log = git("log", most_recent_ref, "--no-merges", "--pretty=format:%cI", "--")
  return [] if log.empty?

  stamps = []
  log.split("\n").each do |text|
    stamps << Time.parse(text)
  rescue StandardError
    next
  end
  return [] if stamps.empty?

  per_month = stamps.map { |stamp| stamp.strftime("%Y-%m") }.tally
  earliest = stamps.min.strftime("%Y-%m")
  current = Time.now.strftime("%Y-%m")

  generate_month_range(earliest, current).map { |month| per_month.fetch(month, 0) }
end
219
+
220
# List every month from start_month through end_month, inclusive.
#
# @param start_month [String] "YYYY-MM"
# @param end_month [String] "YYYY-MM"
# @return [Array<String>] "YYYY-MM" strings in ascending order; empty when
#   start_month is later than end_month
def generate_month_range(start_month, end_month)
  year, month = start_month.split("-").map(&:to_i)
  last_year, last_month = end_month.split("-").map(&:to_i)

  range = []
  until year > last_year || (year == last_year && month > last_month)
    range << format("%04d-%02d", year, month)
    month += 1
    next unless month > 12

    # Roll over into January of the following year.
    month = 1
    year += 1
  end
  range
end
237
+
238
# Number of distinct [name, email] identities among human commits.
#
# @return [Integer]
def contributor_count
  contrib_counter.length
end
241
+
242
# Distinct human contributors per calendar month, from the month of the
# earliest human commit through the current month.
#
# A contributor is identified by "name|email". Commits whose author name
# matches bot? are ignored, as are log lines that are malformed or carry
# an unparseable date.
#
# Uniqueness is computed with Array#uniq rather than Set, so this method
# works on Rubies where Set is not autoloaded (the file does not require
# "set").
#
# @return [Array<Integer>] one count per month, oldest first; [] when
#   there is nothing to count
def contributors_per_month
  return [] if human_commits.empty?

  log = git("log", most_recent_ref, "--no-merges", "--pretty=format:%cI|%an|%ae", "--")
  return [] if log.empty?

  # Each entry is [commit time, "name|email"].
  entries = log.split("\n").filter_map do |line|
    stamp, name, email = line.split("|", 3)
    next if email.nil? || bot?(name)

    committed = begin
      Time.parse(stamp)
    rescue StandardError
      nil
    end
    next unless committed

    [committed, "#{name}|#{email}"]
  end
  return [] if entries.empty?

  entries_by_month = entries.group_by { |committed, _author| committed.strftime("%Y-%m") }
  first_month = entries.map(&:first).min.strftime("%Y-%m")
  last_month = Time.now.strftime("%Y-%m")

  generate_month_range(first_month, last_month).map do |month|
    entries_by_month.fetch(month, []).map(&:last).uniq.size
  end
end
284
+
285
# Commit count per [name, email] identity over human_commits. Memoized.
#
# @return [Hash{Array<String> => Integer}]
def contrib_counter
  return @contrib_counter if @contrib_counter

  tallies = Hash.new { |table, identity| table[identity] = 0 }
  human_commits.each do |_hash, name, email|
    tallies[[name, email]] += 1
  end
  @contrib_counter = tallies
end
291
+
292
# Up to 50 contributors ordered by descending commit count.
#
# @return [Array<Hash{String => Object}>] hashes with "name", "email" and
#   "commits" keys
def top_contributors
  ranked = contrib_counter.sort_by { |_identity, total| -total }

  ranked.first(50).map do |identity, total|
    name, email = identity
    { "name" => name, "email" => email, "commits" => total }
  end
end
298
+
299
# The ten newest tags created within the last 730 days. Memoized.
#
# Tags are listed by `git for-each-ref` sorted by creator date, newest
# first. Each creator date is parsed into a Time before being compared
# with the cutoff: git prints "YYYY-MM-DD HH:MM:SS +ZZZZ" in the tagger's
# own offset, so comparing that text against a UTC ISO-8601 string would
# ignore time zones and misjudge tags near the boundary. Lines without a
# parseable date are skipped.
#
# @return [Array<Hash{String => String}>] hashes with "name" and "date"
#   keys; "date" is the text exactly as printed by git
def recent_tags
  @recent_tags ||= begin
    cutoff = Time.now - (730 * 86_400)
    listing = git(
      "for-each-ref",
      "--sort=-creatordate",
      "--format=%(refname:short) %(creatordate:iso8601)",
      "refs/tags"
    )

    within_window = listing.each_line.filter_map do |line|
      name, date = line.chomp.split(" ", 2)
      next if date.nil? || date.strip.empty?

      created = begin
        Time.parse(date)
      rescue StandardError
        nil
      end
      next unless created && created >= cutoff

      { "name" => name, "date" => date }
    end

    within_window.first(10)
  end
end
316
+
317
# Classify recent activity inside the @since_days window.
#
# @return [String] "active" when any human commit exists in the window,
#   "bot-only" when only bot commits exist, "abandoned" when there are none
def activity_status
  return "active" unless recent_human_commits.empty?

  recent_commit_lines.empty? ? "abandoned" : "bot-only"
end
326
+
327
# Memoized count of distinct human author names in the 6-month window.
#
# @return [Integer]
def contributors_6m_unique
  @contributors_6m_unique || (@contributors_6m_unique = calculate_contributors_6m_unique)
end

# Count distinct author names (emails are not considered) across
# human_commits_6m.
#
# @return [Integer] 0 when there are no such commits
def calculate_contributors_6m_unique
  recent = human_commits_6m
  return 0 if recent.empty?

  recent.uniq { |_hash, author, _email| author }.length
end
336
+
337
# Rate how concentrated the last six months of human commits are.
#
# The share is the busiest author's commit count divided by all human
# commits in the window; authors are grouped by name only.
#
# @return [String] "unknown" with no commits in the window, "high" when
#   the share exceeds @high_thresh, "medium" when it exceeds @med_thresh,
#   otherwise "low"
def bus_factor_risk
  recent = human_commits_6m
  return "unknown" if recent.empty?

  per_author = recent.map { |_hash, author, _email| author }.tally
  dominant_share = per_author.values.max.to_f / recent.size

  return "high" if dominant_share > @high_thresh
  return "medium" if dominant_share > @med_thresh

  "low"
end
356
+
357
# Report which tools from AGENTIC_FILES have a marker path in the repo.
#
# @return [String] comma-separated tool names in AGENTIC_FILES order, or
#   "none" when no marker path exists
def agentic_tools
  detected = AGENTIC_FILES.filter_map do |tool, candidates|
    tool if candidates.any? { |relative| File.exist?(File.join(@repo_path, relative)) }
  end

  detected.empty? ? "none" : detected.join(",")
end
372
+
373
# Infer packaging/deployment styles from well-known files in the repo.
#
# Markers: "container" (Dockerfile), "chart" (charts/ or helm/ directory),
# "debian" (debian/control), "rpm" (.spec or any top-level *.spec file),
# "binary" (a Makefile whose text contains the word "build").
#
# The Makefile text is scrubbed before matching, because a regexp match
# against a string with invalid byte sequences raises ArgumentError.
#
# @return [String] comma-separated markers in the order above, or "none"
def deployment_types
  in_repo = ->(relative) { File.join(@repo_path, relative) }

  types = []
  types << "container" if File.exist?(in_repo.call("Dockerfile"))
  types << "chart" if Dir.exist?(in_repo.call("charts")) || Dir.exist?(in_repo.call("helm"))
  types << "debian" if File.exist?(in_repo.call("debian/control"))
  types << "rpm" if File.exist?(in_repo.call(".spec")) || Dir.glob(in_repo.call("*.spec")).any?

  makefile_path = in_repo.call("Makefile")
  types << "binary" if File.exist?(makefile_path) && File.read(makefile_path).scrub.match?(/\bbuild\b/i)

  types << "none" if types.empty?
  types.join(",")
end
389
+
390
# Image references found in CI configuration, de-duplicated. Memoized.
#
# Sources, in order: every *.yml / *.yaml file under .github/workflows,
# then .gitlab-ci.yml. When nothing is found but deployment_types
# includes "container", a single "ghcr.io/ionos-cloud/<repo dir name>"
# entry is inferred, provided @repo_path contains "ionos-cloud" or
# "github.com".
#
# @return [Array<String>]
def oci_images
  return @oci_images if @oci_images

  ci_files = []
  workflows_dir = File.join(@repo_path, ".github/workflows")
  ci_files.concat(Dir.glob(File.join(workflows_dir, "*.{yml,yaml}"))) if Dir.exist?(workflows_dir)

  gitlab_ci = File.join(@repo_path, ".gitlab-ci.yml")
  ci_files << gitlab_ci if File.exist?(gitlab_ci)

  found = ci_files.flat_map { |path| extract_oci_images_from_file(path) }

  # Infer from the Dockerfile marker only when no explicit reference exists.
  if deployment_types.include?("container") && found.empty?
    known_host = @repo_path.include?("ionos-cloud") || @repo_path.include?("github.com")
    found << "ghcr.io/ionos-cloud/#{File.basename(@repo_path)}" if known_host
  end

  @oci_images = found.uniq
end
415
+
416
# Pull ghcr.io / harbor image references out of one CI file.
#
# Two passes run over the scrubbed file text (scrubbing avoids the
# ArgumentError that String#scan raises on invalid byte sequences):
#
# 1. Values following an `images:` key that mention ghcr.io or harbor,
#    skipping entries that start with "type=".
# 2. Any "<registry>/<path>" reference whose registry is ghcr.io or a host
#    starting with "harbor". The path stops at whitespace, ":" or a quote,
#    so tags and surrounding quotes are not captured. The registry host is
#    kept as written, so a harbor image is not reported under ghcr.io.
#
# @param file_path [String] path of the file to scan
# @return [Array<String>] references in discovery order; [] when the file
#   does not exist
def extract_oci_images_from_file(file_path)
  return [] unless File.exist?(file_path)

  images = []
  content = File.read(file_path).scrub

  # Pattern 1: images: ghcr.io/ionos-cloud/repo-name or harbor...
  content.scan(/images:\s*[|\n]\s*([^\s]+(?:ghcr\.io|harbor)[^\s]+)/m).flatten.each do |img|
    img.split("\n").each do |line|
      line = line.strip
      next if line.empty? || line.start_with?("type=")

      images << line if line.match?(/ghcr\.io|harbor/)
    end
  end

  # Pattern 2: direct image references. The host part excludes "/" so the
  # whole repository path is captured, not just its last segment.
  content.scan(%r{((?:ghcr\.io|harbor[^\s/"']*)/[^\s:"']+)}).flatten.each do |reference|
    images << reference unless images.any? { |img| img.include?(reference) }
  end

  images
end
439
+
440
# Detect which automation platforms are configured in the repo.
#
# @return [String] comma-separated subset of "github-actions"
#   (.github/workflows directory), "gitlab-ci" (.gitlab-ci.yml) and
#   "makefile" (Makefile), in that order; "none" when nothing matches
def workflow_platforms
  probes = [
    ["github-actions", Dir.exist?(File.join(@repo_path, ".github/workflows"))],
    ["gitlab-ci", File.exist?(File.join(@repo_path, ".gitlab-ci.yml"))],
    ["makefile", File.exist?(File.join(@repo_path, "Makefile"))]
  ]

  detected = probes.select { |_label, present| present }.map(&:first)
  detected.empty? ? "none" : detected.join(",")
end
448
+
449
# Classify what the repo's automation does by keyword-matching the text
# of every file returned by collect_workflow_files.
#
# Each file is read, scrubbed and lower-cased before matching; scrubbing
# matters because case conversion and regexp matching raise ArgumentError
# on strings with invalid byte sequences. "dependency-update" is also
# reported when a Dependabot or Renovate config file exists.
#
# @return [String] comma-separated, de-duplicated labels in first-seen
#   order, or "none"
def workflow_types
  # Label => pattern, checked in this order for every file.
  detectors = [
    ["build", /\b(build|compile|docker build|go build|npm run build|maven|gradle)\b/],
    ["test", /\btest[^-]|\bmake test\b/],
    ["unit-test", /\b(unit[- ]test|unittest|test.*unit|jest|pytest|rspec|go test.*-short)\b/],
    ["integration-test", /\b(integration[- ]test|test.*integration|e2e|end-to-end)\b/],
    ["smoke-test", /\b(smoke[- ]test|test.*smoke)\b/],
    ["deploy", /\b(deploy|push|publish|release|kubectl apply|helm (install|upgrade))\b/],
    ["lint", /\b(lint|eslint|rubocop|pylint|golangci-lint|flake8|checkstyle)\b/],
    ["security-scan", /\b(trivy|snyk|sonarqube|codeql|security[- ]scan|vulnerability|scan.*image|bundler-audit|brakeman|ruby_audit|npm audit|yarn audit|safety check|bandit|gosec)\b/],
    ["dependency-update", /\b(dependabot|renovate|dependency.*update|update.*depend)\b/],
    ["ticket-creation", /\b(jira|tosm|create.*ticket|create.*issue|atlassian)\b/]
  ]

  types = []
  collect_workflow_files.each do |file|
    next unless File.exist?(file)

    content_lower = File.read(file).scrub.downcase
    detectors.each { |label, pattern| types << label if content_lower.match?(pattern) }
  end

  # Check for dependency update config files
  dependency_configs = %w[.github/dependabot.yml .github/dependabot.yaml renovate.json .renovaterc]
  types << "dependency-update" if dependency_configs.any? { |relative| File.exist?(File.join(@repo_path, relative)) }

  types.uniq!
  types << "none" if types.empty?
  types.join(",")
end
483
+
484
# Paths of the automation files that workflow_types scans.
#
# @return [Array<String>] every *.yml / *.yaml under .github/workflows,
#   followed by .gitlab-ci.yml and Makefile when they exist
def collect_workflow_files
  found = Dir.glob(File.join(@repo_path, ".github/workflows/*.{yml,yaml}"))

  [".gitlab-ci.yml", "Makefile"].each do |name|
    candidate = File.join(@repo_path, name)
    found << candidate if File.exist?(candidate)
  end

  found
end
491
+
492
# README-derived description, cached once a non-nil value is produced.
#
# @return [String, nil]
def description
  @description = extract_description unless @description
  @description
end

# Links found in the README, cached after the first call.
#
# @return [Array<Hash{String => String}>]
def documentation_links
  @documentation_links = extract_links unless @documentation_links
  @documentation_links
end
499
+
500
# Collect http/https links from the first README* file in the repo.
#
# Markdown links `[text](url)` come first, then bare URLs not already
# collected. Image embeds `![alt](url)` are excluded by the `(?<!!)`
# lookbehind: the "!" sits outside the bracketed text, so a check on the
# captured text alone can never see it. That text check is still needed
# for linked badges `[![alt](img)](target)`, where the captured text does
# start with "!".
#
# @return [Array<Hash{String => String}>] hashes with "text" and "url"
#   keys, unique by URL; [] when no README can be read
def extract_links
  readme_file = Dir.glob(File.join(@repo_path, "README*"), File::FNM_CASEFOLD).first
  return [] unless readme_file && File.exist?(readme_file)

  content = read_file_with_encoding(readme_file)
  return [] unless content

  links = []

  # Match markdown links: [text](url) - only http/https
  content.scan(/(?<!!)\[([^\]]+)\]\(([^)]+)\)/).each do |text, url|
    next unless url.match?(%r{\Ahttps?://})
    next if text.match?(/^!/) # Skip linked badges

    # Drop an optional link title: (url "title")
    url = url.strip.split(/\s+/).first
    clean_text = text.strip.gsub(/[*_`~]/, "")
    links << { "text" => clean_text, "url" => url }
  end

  # Match bare URLs (http/https); the lookbehind skips URLs that directly
  # follow "(" or "[", i.e. ones that are part of markdown syntax.
  content.scan(%r{(?<![(\[])(https?://[^\s<>)\]]+)}).flatten.each do |url|
    next if links.any? { |link| link["url"] == url }

    # Use the host as link text, falling back to the URL itself.
    domain = url[%r{https?://([^/]+)}, 1] || url
    links << { "text" => domain, "url" => url }
  end

  links.uniq { |link| link["url"] }
end
534
+
535
# Build a short description from the first README* file in the repo.
#
# The README text is reduced to its opening paragraph lines, joined with
# newlines, stripped, and then shortened by truncate_at_sentence_boundary.
#
# @return [String, nil] nil when there is no README, it cannot be read, or
#   it yields no description lines
def extract_description
  readme = Dir.glob(File.join(@repo_path, "README*"), File::FNM_CASEFOLD).first
  return nil if readme.nil? || !File.exist?(readme)

  text = read_file_with_encoding(readme)
  return nil unless text

  body_lines = extract_description_lines(text)
  return nil if body_lines.empty?

  truncate_at_sentence_boundary(body_lines.join("\n").strip)
end
549
+
550
# Read a text file and return it as UTF-8, honouring a leading byte-order
# mark.
#
# - UTF-16 LE/BE BOM: the bytes are transcoded to UTF-8.
# - UTF-8 BOM: the three BOM bytes are removed by byte offset. Slicing by
#   character index would treat the BOM as one character and also discard
#   the first two characters of real content.
# - No BOM: the bytes are treated as UTF-8.
#
# In every case the BOM character is absent from the result, and for the
# UTF-8 paths invalid byte sequences are removed.
#
# @param file_path [String]
# @return [String, nil] UTF-8 text, or nil when reading or transcoding
#   raises
def read_file_with_encoding(file_path)
  raw = File.read(file_path, mode: "rb")

  if raw.start_with?("\xFF\xFE".b)
    raw.force_encoding("UTF-16LE").encode("UTF-8").delete_prefix("\uFEFF")
  elsif raw.start_with?("\xFE\xFF".b)
    raw.force_encoding("UTF-16BE").encode("UTF-8").delete_prefix("\uFEFF")
  elsif raw.start_with?("\xEF\xBB\xBF".b)
    raw.byteslice(3, raw.bytesize).force_encoding("UTF-8").scrub("")
  else
    raw.force_encoding("UTF-8").scrub("")
  end
rescue StandardError
  nil
end
565
+
566
# Pick the opening description lines out of README text.
#
# Before the first paragraph, blank lines, a top-level "# " heading,
# image/badge lines and bare single-link lines without spaces are
# skipped. Collection then stops at the first "## " heading, at two
# consecutive blank lines, or once the collected text exceeds 1500
# characters. A single blank line between paragraphs is kept as "".
#
# @param content [String] README text
# @return [Array<String>] stripped lines, possibly empty
def extract_description_lines(content)
  collected = []
  in_body = false
  blanks = 0

  content.sub(/^\uFEFF/, "").each_line do |raw_line|
    text = raw_line.encode("UTF-8", invalid: :replace, undef: :replace, replace: "").strip

    unless in_body
      preamble = text.empty? ||
                 text.match?(/^#\s+/) ||
                 text.match?(/^\[!\[|^!\[/) ||
                 (text.match?(/^\[.*\]\(.*\)$/) && !text.include?(" "))
      next if preamble

      in_body = true
    end

    if text.empty?
      blanks += 1
      break if blanks >= 2 && !collected.empty?

      collected << "" unless collected.empty?
      next
    end

    blanks = 0
    break if text.match?(/^##\s+/)

    collected << text
    break if collected.join("\n").length > 1500
  end

  collected
end
605
+
606
# Shorten a long description by dropping trailing paragraphs.
#
# Text of 600 characters or fewer is returned untouched. Otherwise
# paragraphs (separated by blank lines) are kept from the start; a
# paragraph is dropped, together with everything after it, once at least
# 600 characters are already kept and adding it would exceed 1200.
# Paragraphs are never cut in the middle.
#
# @param description [String]
# @return [String]
def truncate_at_sentence_boundary(description)
  return description if description.length <= 600

  kept = []
  running = 0

  description.split(/\n\n+/).each do |paragraph|
    separator = kept.empty? ? 0 : 2
    projected = running + separator + paragraph.length
    break if running >= 600 && projected > 1200

    kept << paragraph
    running = projected
    break if running > 1200
  end

  kept.empty? ? description : kept.join("\n\n")
end
626
+ end