s3x 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 7a54135015f94c443a05c94451b70d970659b7d1252cff79c93d5e069f2bc24b
4
+ data.tar.gz: 3718a99cf24339d1bf54c22d7f7c7177637814d0b79559d3e932a312de7ab335
5
+ SHA512:
6
+ metadata.gz: 161c7f727111bfed76fed30542f5a8cfc9d165fa404a4f9197d516575598594e217c838835e59244d7ad42ef6fa5b24660b5622c382d8bd6e20770b8864ba6b7
7
+ data.tar.gz: 1c8d04be08177dafd380be5680ba3bece0c79e8cd13ac25c3b258426cdb761f4577fdfc95017e58093d0699ad8fcfcaed97e54470d12e40edf03e6798c948b63
data/CHANGELOG.md ADDED
@@ -0,0 +1,10 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [0.1.0] - 2024-04-24
9
+
10
+ - Initial release
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2024 Dmytro Horoshko
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,101 @@
1
+ # S3x
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/s3x.svg)](https://badge.fury.io/rb/s3x)
4
+ [![Test](https://github.com/ocvit/s3x/workflows/Test/badge.svg)](https://github.com/ocvit/s3x/actions)
5
+ [![Coverage Status](https://coveralls.io/repos/github/ocvit/s3x/badge.svg?branch=main)](https://coveralls.io/github/ocvit/s3x?branch=main)
6
+
7
+ Found something spicy? Want to check it out in more detail? Here's a tool for ya 😎
8
+
9
+ ## Installation
10
+
11
+ Install the gem and add it to your Gemfile:
12
+
13
+ ```sh
14
+ bundle add s3x
15
+ ```
16
+
17
+ Or install it manually:
18
+
19
+ ```sh
20
+ gem install s3x
21
+ ```
22
+
23
+ ## Configuration
24
+
25
+ Initialize a bucket of interest:
26
+
27
+ ```ruby
28
+ bucket = S3x::Bucket.new("http://ftp.ruby-lang.org/")
29
+ ```
30
+
31
+ You can set `prefix` to pre-filter items and/or override default `page_size`:
32
+
33
+ ```ruby
34
+ bucket = S3x::Bucket.new(
35
+ "http://ftp.ruby-lang.org/",
36
+ prefix: "pub/ruby/binaries", # default: nil
37
+ page_size: 666 # default: 1000
38
+ )
39
+ ```
40
+
41
+ ## Usage
42
+
43
+ Get first X (`page_size`) items:
44
+
45
+ ```ruby
46
+ bucket.items
47
+ # => [{
48
+ # key: "pub/media/irb_multiline.mp4",
49
+ # etag: "03fa58ca64375c23cb567be6b129ab11",
50
+ # size: 239988,
51
+ # storage_class: "INTELLIGENT_TIERING",
52
+ # last_modified: 2019-04-20 09:10:30 UTC
53
+ # }, ...]
54
+ ```
55
+
56
+ Get next page items:
57
+
58
+ ```ruby
59
+ bucket.next_items
60
+ # => [{...}, ...]
61
+ ```
62
+
63
+ Check if the next page actually exists:
64
+
65
+ ```ruby
66
+ bucket.next_items?
67
+ # => true/false
68
+ ```
69
+
70
+ Get all items in one take:
71
+
72
+ ```ruby
73
+ bucket.all_items
74
+ # => [{...}, ...]
75
+ ```
76
+
77
+ Download selected item using its `key`:
78
+
79
+ ```ruby
80
+ bucket.download("pub/misc/ci_versions/cruby-jruby.json")
81
+ # => "[\"3.1\",\"3.2\",\"3.3\",\"head\",\"jruby\",\"jruby-head\"]\n"
82
+ ```
83
+
84
+ ## Development
85
+
86
+ ```sh
87
+ bin/setup # install deps
88
+ bin/console # interactive prompt to play around
89
+ rake spec # run tests!
90
+ rake spec:no_vcr # run tests with VCR cassettes disabled!
91
+ rake rubocop # lint code!
92
+ rake rubocop:md # lint docs!
93
+ ```
94
+
95
+ ## Contributing
96
+
97
+ Bug reports and pull requests are welcome on GitHub at https://github.com/ocvit/s3x.
98
+
99
+ ## License
100
+
101
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/lib/s3x/bucket/page.rb ADDED
@@ -0,0 +1,77 @@
1
+ # frozen_string_literal: true
2
+
3
require "rexml"
require "time"
require "uri"
5
+
6
module S3x
  class Bucket
    # One page of a bucket listing (S3 `list-type=2` request).
    #
    # The listing XML is downloaded lazily on the first call to #name, #items
    # or #truncated? and is then reused for every subsequent call.
    class Page
      URL = "%<bucket_url>s/?list-type=2&prefix=%<prefix>s&max-keys=%<page_size>s&start-after=%<after>s"

      attr_reader :bucket, :after

      # @param bucket [#url, #prefix, #page_size] bucket the page belongs to
      # @param after [Hash, nil] item hash (as produced by #items) whose `:key`
      #   is sent as `start-after`; nil requests the first page
      def initialize(bucket, after: nil)
        @bucket = bucket
        @after = after
      end

      # @return [String] listing URL for this page, with `prefix` and
      #   `start-after` percent-encoded
      def url
        @url ||= format(
          URL,
          bucket_url: bucket.url,
          prefix: escape(bucket.prefix),
          page_size: bucket.page_size,
          after: escape(after&.[](:key))
        )
      end

      # @return [String] text of the first `Name` element ("" when absent)
      def name
        @name ||= parse_name
      end

      # @return [Array<Hash>] one hash per `Contents` element with the keys
      #   :key, :etag, :size, :storage_class and :last_modified
      def items
        @items ||= parse_items
      end

      # @return [Boolean] true when the listing reports `IsTruncated` as "true"
      def truncated?
        # `||=` cannot memoize `false`, so check for the ivar explicitly.
        return @truncated if defined?(@truncated)

        @truncated = parse_truncated
      end

      private

      def xml
        @xml ||= REXML::Document.new(Http.get(url))
      end

      def parse_name
        xml.get_text("//Name").to_s
      end

      def parse_items
        xml.get_elements("//Contents").map do |element|
          {
            key: text_of(element, "Key"),
            # Non-bang gsub: `gsub!` returns nil when there is nothing to
            # replace, which would turn an unquoted ETag into nil.
            etag: text_of(element, "ETag").gsub("&quot;", ""),
            size: text_of(element, "Size").to_i,
            storage_class: text_of(element, "StorageClass"),
            last_modified: Time.parse(text_of(element, "LastModified"))
          }
        end
      end

      def parse_truncated
        xml.get_text("//IsTruncated").to_s == "true"
      end

      # Text of the first child element called +tag+ ("" when it is missing).
      def text_of(element, tag)
        element.get_text(tag).to_s
      end

      # Percent-encodes a query-string value; nil becomes "". Spaces are
      # emitted as %20 rather than "+".
      def escape(value)
        URI.encode_www_form_component(value.to_s).gsub("+", "%20")
      end
    end
  end
end
data/lib/s3x/bucket.rb ADDED
@@ -0,0 +1,78 @@
1
+ # frozen_string_literal: true
2
+
3
module S3x
  # Client for listing and downloading the contents of a bucket by URL.
  #
  # Listings are paginated: #items loads the first page, #next_items advances
  # to the following one, and #all_items walks every page in one call.
  class Bucket
    # Raised by #next_items and #next_items? when #items was not called first.
    class FirstPageNotLoaded < StandardError
      def initialize(message = "use #items first")
        super
      end
    end

    DEFAULT_PAGE_SIZE = 1000

    attr_reader :url, :prefix, :page_size

    # @param url [String] any URL pointing at the bucket; only scheme, host and
    #   a non-default port are kept
    # @param prefix [String, nil] key prefix used to pre-filter items
    # @param page_size [Integer] maximum number of items per page
    # @raise [ArgumentError] when +url+ has no scheme or host
    def initialize(url, prefix: nil, page_size: DEFAULT_PAGE_SIZE)
      @url = normalize_url(url)
      @prefix = prefix
      @page_size = page_size
    end

    # @return [String] bucket name as reported by the current page
    def name
      page.name
    end

    # @return [Array<Hash>] items of the current page (loads the first page
    #   when none is loaded yet)
    def items
      page.items
    end

    # Advances to the next page and returns its items.
    #
    # @return [Array<Hash>] items of the next page, or [] when the current
    #   page is the last one
    # @raise [FirstPageNotLoaded] when #items was not called first
    def next_items
      raise FirstPageNotLoaded unless @page
      # Without this guard a call past the last page would issue another
      # request and could wrap around to the first page again.
      return [] unless page.truncated?

      @page = next_page(page)
      items
    end

    # @return [Boolean] whether a page exists after the current one
    # @raise [FirstPageNotLoaded] when #items was not called first
    def next_items?
      raise FirstPageNotLoaded unless @page

      page.truncated?
    end

    # Collects the items of every page, starting from the first one. Does not
    # affect the page tracked by #items / #next_items.
    #
    # @return [Array<Hash>]
    def all_items
      current = first_page
      collected = current.items.dup

      # An empty truncated page has no key to continue from; stop instead of
      # requesting the first page forever.
      while current.truncated? && !current.items.empty?
        current = next_page(current)
        collected.concat(current.items)
      end

      collected
    end

    # Downloads a single item.
    #
    # @param item_key [String] raw (unescaped) item key, as found in `:key`
    # @return [String] response body
    def download(item_key)
      Http.get("#{url}/#{escape_path(item_key)}")
    end

    private

    def page
      @page ||= first_page
    end

    def first_page
      Page.new(self)
    end

    def next_page(page)
      Page.new(self, after: page.items.last)
    end

    # Reduces +url+ to "scheme://host[:port]"; the port is kept only when it
    # differs from the scheme's default.
    def normalize_url(url)
      uri = URI(url)
      raise ArgumentError, "bucket URL must include scheme and host: #{url.inspect}" unless uri.scheme && uri.host

      base = "#{uri.scheme}://#{uri.host}"
      return base if uri.port.nil? || uri.port == uri.default_port

      "#{base}:#{uri.port}"
    end

    # Percent-encodes every path segment of +key+, leaving "/" separators
    # untouched so the key maps onto a valid URL path.
    def escape_path(key)
      key.to_s.split("/", -1).map { |segment| URI.encode_www_form_component(segment).gsub("+", "%20") }.join("/")
    end
  end
end
data/lib/s3x/http.rb ADDED
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "net/http"
4
+
5
module S3x
  # Minimal HTTP GET helper that retries on transient network errors.
  module Http
    RETRIABLE_ERRORS = [
      Errno::ECONNREFUSED,
      Errno::ECONNRESET,
      Errno::ETIMEDOUT,
      Net::OpenTimeout,
      Net::ReadTimeout,
      Net::WriteTimeout,
      OpenSSL::SSL::SSLError
    ].freeze

    MAX_RETRIES = 3

    # Fetches +url+ and returns the response body.
    #
    # @param url [String] URL to request
    # @param retries [Integer] number of retries already used up; the request
    #   is retried until this count reaches MAX_RETRIES
    # @return [String] response body
    # @raise one of RETRIABLE_ERRORS once the retry budget is exhausted; any
    #   other error is raised immediately
    def self.get(url, retries: 0)
      uri = URI(url)

      begin
        Net::HTTP.get(uri)
      rescue *RETRIABLE_ERRORS
        # `>=` so that at most MAX_RETRIES retries happen (the previous `>`
        # comparison allowed one extra).
        raise if retries >= MAX_RETRIES

        retries += 1
        retry
      end
    end
  end
end
data/lib/s3x/version.rb ADDED
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module S3x
4
+ VERSION = "0.1.0" # gem version string; same value as `version` in the gem metadata
5
+ end
data/lib/s3x.rb ADDED
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "s3x/version"
4
+ require_relative "s3x/http"
5
+ require_relative "s3x/bucket/page"
6
+ require_relative "s3x/bucket"
7
+
8
+ module S3x # top-level namespace; its contents are defined by the files required above
9
+ end
metadata ADDED
@@ -0,0 +1,55 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: s3x
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Dmytro Horoshko
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2024-04-24 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Scrape public AWS S3 buckets with ease.
14
+ email:
15
+ - electric.molfar@gmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - CHANGELOG.md
21
+ - LICENSE.txt
22
+ - README.md
23
+ - lib/s3x.rb
24
+ - lib/s3x/bucket.rb
25
+ - lib/s3x/bucket/page.rb
26
+ - lib/s3x/http.rb
27
+ - lib/s3x/version.rb
28
+ homepage: https://github.com/ocvit/s3x
29
+ licenses:
30
+ - MIT
31
+ metadata:
32
+ bug_tracker_uri: https://github.com/ocvit/s3x/issues
33
+ changelog_uri: https://github.com/ocvit/s3x/blob/main/CHANGELOG.md
34
+ homepage_uri: https://github.com/ocvit/s3x
35
+ source_code_uri: https://github.com/ocvit/s3x
36
+ post_install_message:
37
+ rdoc_options: []
38
+ require_paths:
39
+ - lib
40
+ required_ruby_version: !ruby/object:Gem::Requirement
41
+ requirements:
42
+ - - ">="
43
+ - !ruby/object:Gem::Version
44
+ version: '2.7'
45
+ required_rubygems_version: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: '0'
50
+ requirements: []
51
+ rubygems_version: 3.5.3
52
+ signing_key:
53
+ specification_version: 4
54
+ summary: Scrape public AWS S3 buckets with ease
55
+ test_files: []