star-dlp 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 25b1201d34bb3a3e4d219f9faadde2b8d34aba5747c1ceb926482699107b44e2
4
- data.tar.gz: f431fbe097ef52988772208ac1021d4e1dbd11f59fe16c4b8ab512e84ad19906
3
+ metadata.gz: 49aecf46afd8779a951f317d8412ae41d157bcb50d6df163db0eec69f556881b
4
+ data.tar.gz: 4f3b4809beb3fddc5508f2f2e55cd62012829ff6f6b142b28034eee4e2aaedc0
5
5
  SHA512:
6
- metadata.gz: a21e8d101a58153efd911c4974e25683291d9b1ae52dc1f519b8b01f93dbb679c23f0d442eaad8dbf5c1b941e9db6dfa9abea42bc917c1f2a6d65984590dd195
7
- data.tar.gz: 6783258316126bcdd95f5ebedd283d9ffa2ce38c6f24acfdd124e7e4ff584549cf1dc1ed9caa0d3118a1bd210ba2f5de64b51ac456777d6225e61d979480a4fc
6
+ metadata.gz: 824c986da6d7c0e30f058bec67b254d289024da4f7effdfbf1975af7d2a5414671e551d561df110331d5cdcff2a3c8f3427028600ade35d86fe3958264df370a
7
+ data.tar.gz: 18278facb4fd629b173af6f78f9a3c0b10c8ec2cd035590031f4a7fbe30d9a0d016d39fc2c6c2cfc3d285a5b52ac2cd41f6ea250a65a5e0e2589e3b54e213c1a
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- star-dlp (0.1.1)
4
+ star-dlp (0.1.2)
5
5
  fileutils (~> 1.6)
6
6
  github_api (~> 0.19.0)
7
7
  json (~> 2.6)
@@ -2,10 +2,12 @@
2
2
 
3
3
  require "github_api"
4
4
  require "json"
5
+ require "tempfile"
5
6
  require "fileutils"
6
7
  require "time"
7
8
  require "base64"
8
9
  require "thread"
10
+ require "open3"
9
11
 
10
12
  module Star
11
13
  module Dlp
@@ -18,6 +20,33 @@ module Star
18
20
  DEFAULT_RETRY_COUNT = 5
19
21
  DEFAULT_RETRY_DELAY = 1 # seconds
20
22
 
23
# README file names requested from the repository root, in order of preference.
# Frozen so the shared list cannot be mutated at runtime.
README_FORMATS = %w[
  README.md
  README.markdown
  readme.md
  README.org
  README.rst
  README.txt
  README.rdoc
  README.adoc
  README
  readme.org
  readme.rst
  readme.txt
  readme.rdoc
  readme.adoc
  readme
].freeze

# Maps a lower-cased file extension to the format name handed to the markdown
# converter. Extensions that are absent from this table are never converted.
FORMATS_NEEDING_CONVERSION = {
  ".org" => "org",
  ".rst" => "rst",
  ".txt" => "txt",
  "" => "txt" # For files without extension
}.freeze
49
+
21
50
  def initialize(config, username, thread_count: DEFAULT_THREAD_COUNT, skip_readme: false, retry_count: DEFAULT_RETRY_COUNT, retry_delay: DEFAULT_RETRY_DELAY)
22
51
  @config = config
23
52
  @username = username
@@ -209,9 +238,9 @@ module Star
209
238
  ->(repo) { repo }, # Item name is the repo name itself
210
239
  ->(repo_full_name) {
211
240
  # Try to download README
212
- readme_content = fetch_readme(repo_full_name)
241
+ readme_result = fetch_readme(repo_full_name)
213
242
 
214
- if readme_content
243
+ if readme_result && readme_result[:content]
215
244
  # Get starred_at date if available, or use current date as fallback
216
245
  date = nil
217
246
  if repo_dates.key?(repo_full_name) && repo_dates[repo_full_name]
@@ -232,7 +261,9 @@ module Star
232
261
  if File.exist?(md_filepath)
233
262
  # Append README content to existing file
234
263
  File.open(md_filepath, 'a') do |file|
235
- file.puts "\n\n## README\n\n#{readme_content}\n"
264
+ file.puts "\n\n## README"
265
+ file.puts "\n*Format: #{readme_result[:format]}*\n" if readme_result[:format] != "markdown"
266
+ file.puts "\n#{readme_result[:content]}\n"
236
267
  end
237
268
  else
238
269
  # Create new file with repository information and README
@@ -245,10 +276,14 @@ module Star
245
276
  [View on GitHub](https://github.com/#{repo_full_name})
246
277
 
247
278
  ## README
248
-
249
- #{readme_content}
250
279
  MARKDOWN
251
280
 
281
+ # Add format note if not markdown
282
+ content += "\n*Format: #{readme_result[:format]}*\n" if readme_result[:format] != "markdown"
283
+
284
+ # Add README content
285
+ content += "\n#{readme_result[:content]}\n"
286
+
252
287
  File.write(md_filepath, content)
253
288
  end
254
289
 
@@ -282,63 +317,158 @@ module Star
282
317
  }
283
318
  end
284
319
 
285
- # Fetch README.md content from GitHub
320
# Fetch a repository's README from GitHub.
#
# Each name in README_FORMATS is requested in order. When none of them yields
# content, the repository root is listed and the first file whose name contains
# "readme" (case-insensitive) is used instead.
#
# @param repo_full_name [String] repository in "owner/name" form
# @return [Hash, nil] { content: String, format: String }, or nil when no
#   README could be retrieved. :format is "markdown" for .md/.markdown files;
#   otherwise it names the file's original format, even when :content has been
#   converted to markdown.
def fetch_readme(repo_full_name)
  name_parts = repo_full_name.split('/')
  owner = name_parts.first
  repo = name_parts.last

  # Try each well-known README name in order of preference
  README_FORMATS.each do |readme_path|
    begin
      content = readme_file_content(owner, repo, readme_path)
      return readme_result_for(readme_path, content, repo_full_name) if content
    rescue Github::Error::NotFound
      # This name does not exist in the repository; try the next one
      next
    rescue => e
      puts "Error fetching #{readme_path} for #{repo_full_name}: #{e.message}"
      next
    end
  end

  # No well-known name matched: look for any readme-like file in the root directory
  begin
    listing = github.repos.contents.get(user: owner, repo: repo, path: "")
    readme_file = listing.find { |item| item.type == "file" && item.name =~ /readme/i }

    if readme_file
      puts "Found alternative README file: #{readme_file.name} for #{repo_full_name}"
      content = readme_file_content(owner, repo, readme_file.name)
      return readme_result_for(readme_file.name, content, repo_full_name) if content
    end
  rescue => e
    # Best effort: a failure here only means the repository gets no README
    puts "Error checking root directory for README-like files for #{repo_full_name}: #{e.message}"
  end

  # No README found in any format
  nil
end

# Download one file through the contents API and decode it.
#
# @return [String, nil] UTF-8 tagged file content, or nil when the response
#   carries no Base64 payload
# Errors raised by the API client (including Github::Error::NotFound) propagate
# to the caller.
def readme_file_content(owner, repo, path)
  response = github.repos.contents.get(user: owner, repo: repo, path: path)
  return nil unless response.content && response.encoding == 'base64'

  Base64.decode64(response.content).force_encoding('UTF-8')
end

# Build the result hash for a downloaded README, converting it to markdown when
# its extension is listed in FORMATS_NEEDING_CONVERSION.
#
# @param path [String] file name of the README (used only for its extension)
# @param content [String] decoded file content
# @param repo_full_name [String] used in progress messages
# @return [Hash] { content: String, format: String }
def readme_result_for(path, content, repo_full_name)
  ext = File.extname(path).downcase

  unless FORMATS_NEEDING_CONVERSION.key?(ext)
    # Not convertible: keep the content and label it by extension so callers
    # can tell real markdown apart from e.g. rdoc or adoc.
    format =
      if [".md", ".markdown"].include?(ext)
        "markdown"
      elsif ext.empty?
        "txt"
      else
        ext[1..]
      end
    return { content: content, format: format }
  end

  format = FORMATS_NEEDING_CONVERSION[ext]
  puts "Converting #{path} from #{format} to markdown for #{repo_full_name}"

  # Tempfile.create closes and removes the file when the block exits,
  # including when the conversion raises.
  Tempfile.create(['readme', ".#{format}"]) do |temp_file|
    temp_file.write(content)
    temp_file.close # flush to disk before the converter reads the path

    markdown_content, status = convert_to_markdown(temp_file.path, format)

    if status.success?
      { content: markdown_content, format: format }
    else
      puts "Pandoc conversion failed for #{repo_full_name}, using original content"
      { content: content, format: format }
    end
  end
end

private :readme_file_content, :readme_result_for
446
 
447
# Convert a README file to markdown using pandoc.
#
# Conversion is best effort: whenever pandoc cannot be used, the original file
# content is returned together with a failed status so the caller can fall back.
#
# @param file_path [String] path of a file holding the original content
# @param format [String] source format: a pandoc reader name such as "org" or
#   "rst", or "txt" for plain text
# @return [Array] two elements: the resulting text (String) and a status
#   object responding to #success?
def convert_to_markdown(file_path, format)
  # Minimal stand-in for Process::Status, used when no pandoc process ran.
  status_stub = lambda do |ok|
    Object.new.tap { |stub| stub.define_singleton_method(:success?) { ok } }
  end

  # pandoc has no "txt" reader, so plain text is passed through unchanged
  # instead of spawning a process that is bound to fail.
  return [File.read(file_path), status_stub.call(true)] if format == "txt"

  begin
    # capture3 keeps stderr apart so pandoc warnings never leak into the markdown.
    output, error_output, status = Open3.capture3("pandoc", "-f", format, "-t", "markdown", file_path)
  rescue Errno::ENOENT
    # Raised by the spawn itself when the pandoc executable cannot be found.
    puts "Warning: pandoc is not installed or not in PATH. Cannot convert non-markdown formats."
    return [File.read(file_path), status_stub.call(false)]
  rescue => e
    puts "Error during conversion: #{e.message}"
    return [File.read(file_path), status_stub.call(false)]
  end

  return [output, status] if status.success?

  puts "Pandoc conversion failed: #{error_output}"
  [File.read(file_path), status]
end
471
+
342
472
  private
343
473
 
344
474
  # Process a list of items using multiple threads
@@ -513,9 +643,12 @@ module Star
513
643
 
514
644
  # Try to fetch README.md content if not skipped
515
645
  unless @skip_readme
516
- readme_content = fetch_readme(repo_full_name)
517
- if readme_content
518
- content += "\n\n## README\n\n#{readme_content}\n"
646
+ readme_result = fetch_readme(repo_full_name)
647
+ if readme_result && readme_result[:content]
648
+ content += "\n\n## README"
649
+ # Add format note if not markdown
650
+ content += "\n*Format: #{readme_result[:format]}*\n" if readme_result[:format] != "markdown"
651
+ content += "\n#{readme_result[:content]}\n"
519
652
  else
520
653
  content += "\n\n## Description\n\n#{get_description(star)}\n"
521
654
  end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Star
4
4
  module Dlp
5
- VERSION = "0.1.1"
5
+ VERSION = "0.1.2"
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: star-dlp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Liu Xiang
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-03-16 00:00:00.000000000 Z
11
+ date: 2025-03-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: github_api