star-dlp 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/star/dlp/downloader.rb +178 -45
- data/lib/star/dlp/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 49aecf46afd8779a951f317d8412ae41d157bcb50d6df163db0eec69f556881b
|
4
|
+
data.tar.gz: 4f3b4809beb3fddc5508f2f2e55cd62012829ff6f6b142b28034eee4e2aaedc0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 824c986da6d7c0e30f058bec67b254d289024da4f7effdfbf1975af7d2a5414671e551d561df110331d5cdcff2a3c8f3427028600ade35d86fe3958264df370a
|
7
|
+
data.tar.gz: 18278facb4fd629b173af6f78f9a3c0b10c8ec2cd035590031f4a7fbe30d9a0d016d39fc2c6c2cfc3d285a5b52ac2cd41f6ea250a65a5e0e2589e3b54e213c1a
|
data/Gemfile.lock
CHANGED
data/lib/star/dlp/downloader.rb
CHANGED
@@ -2,10 +2,12 @@
|
|
2
2
|
|
3
3
|
require "github_api"
|
4
4
|
require "json"
|
5
|
+
require "tempfile"
|
5
6
|
require "fileutils"
|
6
7
|
require "time"
|
7
8
|
require "base64"
|
8
9
|
require "thread"
|
10
|
+
require "open3"
|
9
11
|
|
10
12
|
module Star
|
11
13
|
module Dlp
|
@@ -18,6 +20,33 @@ module Star
|
|
18
20
|
DEFAULT_RETRY_COUNT = 5
|
19
21
|
DEFAULT_RETRY_DELAY = 1 # seconds
|
20
22
|
|
23
|
+
# Supported README formats in order of preference
|
24
|
+
README_FORMATS = [
|
25
|
+
"README.md",
|
26
|
+
"README.markdown",
|
27
|
+
"readme.md",
|
28
|
+
"README.org",
|
29
|
+
"README.rst",
|
30
|
+
"README.txt",
|
31
|
+
"README.rdoc",
|
32
|
+
"README.adoc",
|
33
|
+
"README",
|
34
|
+
"readme.org",
|
35
|
+
"readme.rst",
|
36
|
+
"readme.txt",
|
37
|
+
"readme.rdoc",
|
38
|
+
"readme.adoc",
|
39
|
+
"readme"
|
40
|
+
]
|
41
|
+
|
42
|
+
# Formats that need conversion to markdown
|
43
|
+
FORMATS_NEEDING_CONVERSION = {
|
44
|
+
".org" => "org",
|
45
|
+
".rst" => "rst",
|
46
|
+
".txt" => "txt",
|
47
|
+
"" => "txt" # For files without extension
|
48
|
+
}
|
49
|
+
|
21
50
|
def initialize(config, username, thread_count: DEFAULT_THREAD_COUNT, skip_readme: false, retry_count: DEFAULT_RETRY_COUNT, retry_delay: DEFAULT_RETRY_DELAY)
|
22
51
|
@config = config
|
23
52
|
@username = username
|
@@ -209,9 +238,9 @@ module Star
|
|
209
238
|
->(repo) { repo }, # Item name is the repo name itself
|
210
239
|
->(repo_full_name) {
|
211
240
|
# Try to download README
|
212
|
-
|
241
|
+
readme_result = fetch_readme(repo_full_name)
|
213
242
|
|
214
|
-
if
|
243
|
+
if readme_result && readme_result[:content]
|
215
244
|
# Get starred_at date if available, or use current date as fallback
|
216
245
|
date = nil
|
217
246
|
if repo_dates.key?(repo_full_name) && repo_dates[repo_full_name]
|
@@ -232,7 +261,9 @@ module Star
|
|
232
261
|
if File.exist?(md_filepath)
|
233
262
|
# Append README content to existing file
|
234
263
|
File.open(md_filepath, 'a') do |file|
|
235
|
-
file.puts "\n\n## README
|
264
|
+
file.puts "\n\n## README"
|
265
|
+
file.puts "\n*Format: #{readme_result[:format]}*\n" if readme_result[:format] != "markdown"
|
266
|
+
file.puts "\n#{readme_result[:content]}\n"
|
236
267
|
end
|
237
268
|
else
|
238
269
|
# Create new file with repository information and README
|
@@ -245,10 +276,14 @@ module Star
|
|
245
276
|
[View on GitHub](https://github.com/#{repo_full_name})
|
246
277
|
|
247
278
|
## README
|
248
|
-
|
249
|
-
#{readme_content}
|
250
279
|
MARKDOWN
|
251
280
|
|
281
|
+
# Add format note if not markdown
|
282
|
+
content += "\n*Format: #{readme_result[:format]}*\n" if readme_result[:format] != "markdown"
|
283
|
+
|
284
|
+
# Add README content
|
285
|
+
content += "\n#{readme_result[:content]}\n"
|
286
|
+
|
252
287
|
File.write(md_filepath, content)
|
253
288
|
end
|
254
289
|
|
@@ -282,63 +317,158 @@ module Star
|
|
282
317
|
}
|
283
318
|
end
|
284
319
|
|
285
|
-
# Fetch README
|
320
|
+
# Fetch README content from GitHub
|
321
|
+
# Returns a hash with :content and :format keys, or nil if not found
|
286
322
|
def fetch_readme(repo_full_name)
|
287
|
-
|
288
|
-
|
289
|
-
response = github.repos.contents.get(
|
290
|
-
user: repo_full_name.split('/').first,
|
291
|
-
repo: repo_full_name.split('/').last,
|
292
|
-
path: 'README.md'
|
293
|
-
)
|
294
|
-
|
295
|
-
# Decode content from Base64
|
296
|
-
if response.content && response.encoding == 'base64'
|
297
|
-
return Base64.decode64(response.content).force_encoding('UTF-8')
|
298
|
-
end
|
299
|
-
rescue Github::Error::NotFound
|
300
|
-
# Try README.markdown if README.md not found
|
323
|
+
# Try each README format in order
|
324
|
+
README_FORMATS.each do |readme_path|
|
301
325
|
begin
|
326
|
+
# Get README content using GitHub API
|
302
327
|
response = github.repos.contents.get(
|
303
328
|
user: repo_full_name.split('/').first,
|
304
329
|
repo: repo_full_name.split('/').last,
|
305
|
-
path:
|
330
|
+
path: readme_path
|
306
331
|
)
|
307
332
|
|
333
|
+
# Decode content from Base64
|
308
334
|
if response.content && response.encoding == 'base64'
|
309
|
-
|
335
|
+
content = Base64.decode64(response.content).force_encoding('UTF-8')
|
336
|
+
|
337
|
+
# Get file extension
|
338
|
+
ext = File.extname(readme_path).downcase
|
339
|
+
|
340
|
+
# Check if we need to convert the content
|
341
|
+
if FORMATS_NEEDING_CONVERSION.key?(ext)
|
342
|
+
format = FORMATS_NEEDING_CONVERSION[ext]
|
343
|
+
puts "Converting #{readme_path} from #{format} to markdown for #{repo_full_name}"
|
344
|
+
|
345
|
+
# Create a temporary file with the content
|
346
|
+
temp_file = Tempfile.new(['readme', ".#{format}"])
|
347
|
+
begin
|
348
|
+
temp_file.write(content)
|
349
|
+
temp_file.close
|
350
|
+
|
351
|
+
# Use pandoc to convert to markdown
|
352
|
+
markdown_content, status = convert_to_markdown(temp_file.path, format)
|
353
|
+
|
354
|
+
if status.success?
|
355
|
+
return { content: markdown_content, format: format }
|
356
|
+
else
|
357
|
+
puts "Pandoc conversion failed for #{repo_full_name}, using original content"
|
358
|
+
return { content: content, format: format }
|
359
|
+
end
|
360
|
+
ensure
|
361
|
+
temp_file.unlink
|
362
|
+
end
|
363
|
+
else
|
364
|
+
# Already markdown, no conversion needed
|
365
|
+
return { content: content, format: "markdown" }
|
366
|
+
end
|
310
367
|
end
|
311
368
|
rescue Github::Error::NotFound
|
312
|
-
# Try
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
369
|
+
# Try next format
|
370
|
+
next
|
371
|
+
rescue => e
|
372
|
+
puts "Error fetching #{readme_path} for #{repo_full_name}: #{e.message}"
|
373
|
+
next
|
374
|
+
end
|
375
|
+
end
|
376
|
+
|
377
|
+
# No README found in predefined formats, check for any readme-like file in the root directory
|
378
|
+
begin
|
379
|
+
# Get repository contents
|
380
|
+
contents = github.repos.contents.get(
|
381
|
+
user: repo_full_name.split('/').first,
|
382
|
+
repo: repo_full_name.split('/').last,
|
383
|
+
path: "" # Root directory
|
384
|
+
)
|
385
|
+
|
386
|
+
# Look for any file with name matching /readme/i
|
387
|
+
readme_file = contents.find { |item| item.type == "file" && item.name =~ /readme/i }
|
388
|
+
|
389
|
+
if readme_file
|
390
|
+
puts "Found alternative README file: #{readme_file.name} for #{repo_full_name}"
|
391
|
+
|
392
|
+
# Get README content
|
393
|
+
readme_content = github.repos.contents.get(
|
394
|
+
user: repo_full_name.split('/').first,
|
395
|
+
repo: repo_full_name.split('/').last,
|
396
|
+
path: readme_file.name
|
397
|
+
)
|
398
|
+
|
399
|
+
# Decode content from Base64
|
400
|
+
if readme_content.content && readme_content.encoding == 'base64'
|
401
|
+
content = Base64.decode64(readme_content.content).force_encoding('UTF-8')
|
402
|
+
|
403
|
+
# Get file extension
|
404
|
+
ext = File.extname(readme_file.name).downcase
|
319
405
|
|
320
|
-
if
|
321
|
-
|
406
|
+
# Check if we need to convert the content
|
407
|
+
if FORMATS_NEEDING_CONVERSION.key?(ext)
|
408
|
+
format = FORMATS_NEEDING_CONVERSION[ext]
|
409
|
+
puts "Converting #{readme_file.name} from #{format} to markdown for #{repo_full_name}"
|
410
|
+
|
411
|
+
# Create a temporary file with the content
|
412
|
+
temp_file = Tempfile.new(['readme', ".#{format}"])
|
413
|
+
begin
|
414
|
+
temp_file.write(content)
|
415
|
+
temp_file.close
|
416
|
+
|
417
|
+
# Use pandoc to convert to markdown
|
418
|
+
markdown_content, status = convert_to_markdown(temp_file.path, format)
|
419
|
+
|
420
|
+
if status.success?
|
421
|
+
return { content: markdown_content, format: format }
|
422
|
+
else
|
423
|
+
puts "Pandoc conversion failed for #{repo_full_name}, using original content"
|
424
|
+
return { content: content, format: format }
|
425
|
+
end
|
426
|
+
ensure
|
427
|
+
temp_file.unlink
|
428
|
+
end
|
429
|
+
else
|
430
|
+
# Determine format based on extension or default to txt
|
431
|
+
format = ext.empty? ? "txt" : ext[1..]
|
432
|
+
# Use markdown format if extension suggests it's already markdown
|
433
|
+
format = "markdown" if [".md", ".markdown"].include?(ext)
|
434
|
+
|
435
|
+
return { content: content, format: format }
|
322
436
|
end
|
323
|
-
rescue Github::Error::NotFound
|
324
|
-
# README not found
|
325
|
-
return nil
|
326
|
-
rescue => e
|
327
|
-
puts "Error fetching lowercase readme.md for #{repo_full_name}: #{e.message}"
|
328
|
-
raise e
|
329
437
|
end
|
330
|
-
rescue => e
|
331
|
-
puts "Error fetching README.markdown for #{repo_full_name}: #{e.message}"
|
332
|
-
raise e
|
333
438
|
end
|
334
439
|
rescue => e
|
335
|
-
puts "Error
|
336
|
-
raise e
|
440
|
+
puts "Error checking root directory for README-like files for #{repo_full_name}: #{e.message}"
|
337
441
|
end
|
338
442
|
|
443
|
+
# No README found in any format
|
339
444
|
nil
|
340
445
|
end
|
341
446
|
|
447
|
+
# Convert content from a given format to markdown using pandoc
|
448
|
+
def convert_to_markdown(file_path, format)
|
449
|
+
begin
|
450
|
+
# Check if pandoc is installed
|
451
|
+
version_output, status = Open3.capture2e("pandoc --version")
|
452
|
+
unless status.success?
|
453
|
+
puts "Warning: pandoc is not installed or not in PATH. Cannot convert non-markdown formats."
|
454
|
+
return [File.read(file_path), status]
|
455
|
+
end
|
456
|
+
|
457
|
+
# Use pandoc to convert to markdown
|
458
|
+
output, status = Open3.capture2e("pandoc", "-f", format, "-t", "markdown", file_path)
|
459
|
+
|
460
|
+
if status.success?
|
461
|
+
return [output, status]
|
462
|
+
else
|
463
|
+
puts "Pandoc conversion failed: #{output}"
|
464
|
+
return [File.read(file_path), status]
|
465
|
+
end
|
466
|
+
rescue => e
|
467
|
+
puts "Error during conversion: #{e.message}"
|
468
|
+
return [File.read(file_path), OpenStruct.new(success?: false)]
|
469
|
+
end
|
470
|
+
end
|
471
|
+
|
342
472
|
private
|
343
473
|
|
344
474
|
# Process a list of items using multiple threads
|
@@ -513,9 +643,12 @@ module Star
|
|
513
643
|
|
514
644
|
# Try to fetch README.md content if not skipped
|
515
645
|
unless @skip_readme
|
516
|
-
|
517
|
-
if
|
518
|
-
content += "\n\n## README
|
646
|
+
readme_result = fetch_readme(repo_full_name)
|
647
|
+
if readme_result && readme_result[:content]
|
648
|
+
content += "\n\n## README"
|
649
|
+
# Add format note if not markdown
|
650
|
+
content += "\n*Format: #{readme_result[:format]}*\n" if readme_result[:format] != "markdown"
|
651
|
+
content += "\n#{readme_result[:content]}\n"
|
519
652
|
else
|
520
653
|
content += "\n\n## Description\n\n#{get_description(star)}\n"
|
521
654
|
end
|
data/lib/star/dlp/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: star-dlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Liu Xiang
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-03-
|
11
|
+
date: 2025-03-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: github_api
|