brightdata 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +38 -0
- data/LICENSE.txt +21 -0
- data/README.md +149 -0
- data/lib/brightdata/client.rb +25 -0
- data/lib/brightdata/datasets.rb +31 -0
- data/lib/brightdata/errors.rb +101 -0
- data/lib/brightdata/http.rb +195 -0
- data/lib/brightdata/linkedin/companies.rb +32 -0
- data/lib/brightdata/linkedin/endpoint.rb +195 -0
- data/lib/brightdata/linkedin/jobs.rb +83 -0
- data/lib/brightdata/linkedin/namespace.rb +32 -0
- data/lib/brightdata/linkedin/people.rb +39 -0
- data/lib/brightdata/linkedin/posts.rb +97 -0
- data/lib/brightdata/linkedin/profiles.rb +32 -0
- data/lib/brightdata/linkedin/types/company.rb +92 -0
- data/lib/brightdata/linkedin/types/company_url_input.rb +13 -0
- data/lib/brightdata/linkedin/types/discovered_profile.rb +45 -0
- data/lib/brightdata/linkedin/types/job.rb +54 -0
- data/lib/brightdata/linkedin/types/job_keyword_input.rb +44 -0
- data/lib/brightdata/linkedin/types/job_url_input.rb +13 -0
- data/lib/brightdata/linkedin/types/people_discover_input.rb +24 -0
- data/lib/brightdata/linkedin/types/post.rb +67 -0
- data/lib/brightdata/linkedin/types/post_company_url_input.rb +13 -0
- data/lib/brightdata/linkedin/types/post_profile_url_input.rb +13 -0
- data/lib/brightdata/linkedin/types/post_url_input.rb +13 -0
- data/lib/brightdata/linkedin/types/profile.rb +81 -0
- data/lib/brightdata/linkedin/types/profile_url_input.rb +14 -0
- data/lib/brightdata/live_trace.rb +124 -0
- data/lib/brightdata/result.rb +27 -0
- data/lib/brightdata/snapshot.rb +122 -0
- data/lib/brightdata/version.rb +6 -0
- data/lib/brightdata.rb +60 -0
- data/llm.md +109 -0
- metadata +193 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 22a0b2f8657a178d69eb1c6c3220db2d086425af48c8388b68b2633d6d9ff194
|
|
4
|
+
data.tar.gz: cd967fbe25a97ff0eb0cad7b1319a830c98684d406bac3add844dc075b3a1ab2
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: bec94e0b1b6d38365cd5f0d77579159ef0b3bbe359378c2f4f76a0bf984b52be62f010c608421b59a39d63cb17ce5a004cd7e440b2e87e5797c5a7d0913f94c1
|
|
7
|
+
data.tar.gz: 204b8987e39b9284fb956373a75f349b8ddcaccb62a6a917ef3a99de4942135486354bb8f90ec247c9ea2374632c408597a50b78bba8d0522f6d5eab493c55d2
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
5
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
6
|
+
|
|
7
|
+
## [Unreleased]
|
|
8
|
+
|
|
9
|
+
## [0.1.0] - 2026-05-28
|
|
10
|
+
|
|
11
|
+
Initial public release.
|
|
12
|
+
|
|
13
|
+
### Added
|
|
14
|
+
- `BrightData::Client` with configurable `api_token`, `base_url`, and `logger`.
|
|
15
|
+
- LinkedIn endpoints under `client.linkedin`:
|
|
16
|
+
- `profiles` and `companies` (collect by URL)
|
|
17
|
+
- `jobs.collect_by_url`, `jobs.discover_by_url`, `jobs.discover_by_keyword`
|
|
18
|
+
- `posts.collect_by_url`, `posts.discover_by_url`,
|
|
19
|
+
`posts.discover_by_profile_url`, `posts.discover_by_company_url`
|
|
20
|
+
- `people.discover_new_profiles`
|
|
21
|
+
- Every endpoint exposes both `#scrape` (synchronous, parsed results) and
|
|
22
|
+
`#trigger` (asynchronous, returns a `Snapshot`).
|
|
23
|
+
- `BrightData::Snapshot#wait` polling with configurable `timeout` and
|
|
24
|
+
`poll_interval`, returning a `BrightData::Result` (`SimpleResult::Success` or
|
|
25
|
+
`Failure`).
|
|
26
|
+
- Typed `Data`-backed result objects (`Profile`, `Company`, `Job`, `Post`,
|
|
27
|
+
`DiscoveredProfile`) and input objects (`JobKeywordInput`,
|
|
28
|
+
`PeopleDiscoverInput`, and `*UrlInput` variants), each exposing typed readers
|
|
29
|
+
plus `#raw`.
|
|
30
|
+
- Error hierarchy rooted at `BrightData::Error`: `ConfigurationError`,
|
|
31
|
+
`ArgumentError`, `AuthError`, `HTTPError`, `RateLimitError` (`#retry_after`),
|
|
32
|
+
`ServerError`, `ScrapeTimeoutError` (`#snapshot`).
|
|
33
|
+
- Mutable `BrightData::Datasets::LINKEDIN` registry for overriding or adding
|
|
34
|
+
dataset IDs at runtime.
|
|
35
|
+
- Optional request/response tracing via `BRIGHTDATA_LIVE`, recorded under
|
|
36
|
+
`tmp/live/`.
|
|
37
|
+
- `llm.md` — single-file, LLM-friendly reference generated from YARD docs via
|
|
38
|
+
`bin/generate_llm.rb` and `bin/prepare_release`.
|
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Lucian Ghinda
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
# brightdata
|
|
2
|
+
|
|
3
|
+
A typed, ergonomic Ruby client for [Bright Data](https://brightdata.com)'s
|
|
4
|
+
Datasets v3 scraper APIs. Version 0.1.0 ships the LinkedIn endpoints.
|
|
5
|
+
|
|
6
|
+
## Installation
|
|
7
|
+
|
|
8
|
+
Add it to your Gemfile:
|
|
9
|
+
|
|
10
|
+
```ruby
|
|
11
|
+
gem "brightdata"
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
Then run `bundle install`, or install it directly:
|
|
15
|
+
|
|
16
|
+
```sh
|
|
17
|
+
gem install brightdata
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
Requires Ruby 3.4.4 or newer.
|
|
21
|
+
|
|
22
|
+
## Configuration
|
|
23
|
+
|
|
24
|
+
Create a client with your Bright Data API token:
|
|
25
|
+
|
|
26
|
+
```ruby
|
|
27
|
+
client = BrightData::Client.new(api_token: ENV.fetch("BRIGHTDATA_API_TOKEN"))
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
Optional keyword arguments:
|
|
31
|
+
|
|
32
|
+
- `base_url:` — override the API host (defaults to `https://api.brightdata.com`).
|
|
33
|
+
- `logger:` — a `Logger` that receives one `debug` line per request.
|
|
34
|
+
|
|
35
|
+
## Synchronous vs. asynchronous
|
|
36
|
+
|
|
37
|
+
Every endpoint exposes two methods:
|
|
38
|
+
|
|
39
|
+
- `#scrape(...)` runs synchronously and returns parsed, typed results. Bright
|
|
40
|
+
Data caps synchronous scrapes at 60 seconds; if a job exceeds that,
|
|
41
|
+
`scrape` raises `BrightData::ScrapeTimeoutError`, which carries a resumable
|
|
42
|
+
snapshot (`error.snapshot`).
|
|
43
|
+
- `#trigger(...)` starts an asynchronous collection and returns a
|
|
44
|
+
`BrightData::Snapshot` you poll with `#wait`.
|
|
45
|
+
|
|
46
|
+
```ruby
|
|
47
|
+
# Synchronous
|
|
48
|
+
profiles = client.linkedin.profiles.scrape(
|
|
49
|
+
urls: ["https://www.linkedin.com/in/example/"]
|
|
50
|
+
)
|
|
51
|
+
profiles.first.name # => "Example Person"
|
|
52
|
+
|
|
53
|
+
# Asynchronous
|
|
54
|
+
snapshot = client.linkedin.profiles.trigger(
|
|
55
|
+
urls: ["https://www.linkedin.com/in/example/"]
|
|
56
|
+
)
|
|
57
|
+
result = snapshot.wait # blocks, polling progress until ready/failed/timeout
|
|
58
|
+
if result.success?
|
|
59
|
+
result.payload # => Array<BrightData::LinkedIn::Types::Profile>
|
|
60
|
+
else
|
|
61
|
+
result.error # => raw failure payload from Bright Data
|
|
62
|
+
end
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
`Snapshot#wait` accepts `timeout:` (default 300s) and `poll_interval:`
|
|
66
|
+
(default 5s), and raises `BrightData::ScrapeTimeoutError` if the deadline
|
|
67
|
+
passes before the snapshot reaches a terminal state.
|
|
68
|
+
|
|
69
|
+
## LinkedIn endpoints
|
|
70
|
+
|
|
71
|
+
| Call | Argument | Returns |
|
|
72
|
+
| --- | --- | --- |
|
|
73
|
+
| `linkedin.profiles` | `urls:` | `Types::Profile` |
|
|
74
|
+
| `linkedin.companies` | `urls:` | `Types::Company` |
|
|
75
|
+
| `linkedin.jobs.collect_by_url` | `urls:` | `Types::Job` |
|
|
76
|
+
| `linkedin.jobs.discover_by_url` | `urls:` | `Types::Job` |
|
|
77
|
+
| `linkedin.jobs.discover_by_keyword` | `queries:` (`Types::JobKeywordInput`) | `Types::Job` |
|
|
78
|
+
| `linkedin.posts.collect_by_url` | `urls:` | `Types::Post` |
|
|
79
|
+
| `linkedin.posts.discover_by_url` | `urls:` | `Types::Post` |
|
|
80
|
+
| `linkedin.posts.discover_by_profile_url` | `profile_urls:` | `Types::Post` |
|
|
81
|
+
| `linkedin.posts.discover_by_company_url` | `company_urls:` | `Types::Post` |
|
|
82
|
+
| `linkedin.people.discover_new_profiles` | `queries:` (`Types::PeopleDiscoverInput`) | `Types::DiscoveredProfile` |
|
|
83
|
+
|
|
84
|
+
### Discovery by keyword
|
|
85
|
+
|
|
86
|
+
```ruby
|
|
87
|
+
query = BrightData::LinkedIn::Types::JobKeywordInput.new(
|
|
88
|
+
location: "New York",
|
|
89
|
+
keyword: "ruby",
|
|
90
|
+
country: nil, time_range: nil, job_type: nil, experience_level: nil,
|
|
91
|
+
remote: nil, company: nil, selective_search: nil,
|
|
92
|
+
jobs_to_not_include: nil, location_radius: nil
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
jobs = client.linkedin.jobs.discover_by_keyword.scrape(queries: [query])
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
`nil` fields are omitted from the request payload.
|
|
99
|
+
|
|
100
|
+
## Result types
|
|
101
|
+
|
|
102
|
+
Results are immutable `Data` value objects (`Types::Profile`, `Types::Company`,
|
|
103
|
+
`Types::Job`, `Types::Post`, `Types::DiscoveredProfile`). Each exposes typed
|
|
104
|
+
readers for the common fields plus `#raw`, the full parsed response hash, so you
|
|
105
|
+
can reach fields the gem does not yet type:
|
|
106
|
+
|
|
107
|
+
```ruby
|
|
108
|
+
profile = profiles.first
|
|
109
|
+
profile.name # typed reader
|
|
110
|
+
profile.raw[:posts] # anything not yet typed
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Error handling
|
|
114
|
+
|
|
115
|
+
All errors inherit from `BrightData::Error`:
|
|
116
|
+
|
|
117
|
+
- `BrightData::ConfigurationError` — blank API token.
|
|
118
|
+
- `BrightData::ArgumentError` — bad argument shape (note: not Ruby's
|
|
119
|
+
`::ArgumentError`).
|
|
120
|
+
- `BrightData::AuthError` — 401/403 from the API.
|
|
121
|
+
- `BrightData::RateLimitError` — 429; exposes `#retry_after`.
|
|
122
|
+
- `BrightData::ServerError` — 5xx.
|
|
123
|
+
- `BrightData::HTTPError` — other transport failures and timeouts.
|
|
124
|
+
- `BrightData::ScrapeTimeoutError` — synchronous scrape exceeded the 60s cap, or `Snapshot#wait` passed its `timeout:`;
|
|
125
|
+
recover via `error.snapshot.wait`.
|
|
126
|
+
|
|
127
|
+
```ruby
|
|
128
|
+
begin
|
|
129
|
+
client.linkedin.profiles.scrape(urls: urls)
|
|
130
|
+
rescue BrightData::ScrapeTimeoutError => e
|
|
131
|
+
e.snapshot.wait # fall back to async polling
|
|
132
|
+
rescue BrightData::RateLimitError => e
|
|
133
|
+
sleep(e.retry_after || 5)
|
|
134
|
+
retry
|
|
135
|
+
rescue BrightData::Error => e
|
|
136
|
+
warn "Bright Data request failed: #{e.message}"
|
|
137
|
+
end
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## Documentation for AI agents
|
|
141
|
+
|
|
142
|
+
[`llm.md`](llm.md) is a single-file, LLM-friendly reference generated from the
|
|
143
|
+
gem's YARD documentation. Point a coding assistant at it for the full API
|
|
144
|
+
surface and usage examples. Regenerate it with `bin/prepare_release` (or
|
|
145
|
+
`bundle exec yardoc --format=markdown && bin/generate_llm.rb`).
|
|
146
|
+
|
|
147
|
+
## License
|
|
148
|
+
|
|
149
|
+
Released under the [MIT License](LICENSE.txt).
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module BrightData
  # Entry point for the Bright Data API.
  #
  # A client owns one {BrightData::HTTP} transport and passes that same
  # instance to the LinkedIn endpoint namespace.
  #
  # @example
  #   client = BrightData::Client.new(api_token: ENV.fetch("BRIGHTDATA_API_TOKEN"))
  #   client.linkedin
  class Client
    # @return [BrightData::HTTP] transport used for every request made through this client
    attr_reader :http

    # @return [BrightData::LinkedIn::Namespace] namespace exposing the LinkedIn endpoints
    attr_reader :linkedin

    # Build the transport first, then wire the namespace to it.
    #
    # @param api_token [String] Bright Data API token
    # @param base_url [String] alternative API base URL (defaults to {BrightData::HTTP::BASE_URL})
    # @param logger [Logger, nil] logger forwarded to the transport for request tracing
    # @raise [BrightData::ConfigurationError] if `api_token` is nil or empty
    def initialize(api_token:, base_url: BrightData::HTTP::BASE_URL, logger: nil)
      transport = HTTP.new(api_token: api_token, base_url: base_url, logger: logger)

      @http = transport
      @linkedin = LinkedIn::Namespace.new(http: transport)
    end
  end
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module BrightData
  # Lookup table from symbolic endpoint names to Bright Data dataset IDs.
  module Datasets
    # Dataset IDs for the LinkedIn endpoints. Several endpoints share one
    # dataset ID; they differ only in how the dataset is queried.
    #
    # @return [Hash{Symbol=>String}] LinkedIn dataset IDs
    LINKEDIN = { # rubocop:disable Style/MutableConstant -- intentionally mutable so callers can register or override dataset IDs
      profiles_collect_by_url: "gd_l1viktl72bvl7bjuj0",
      companies_collect_by_url: "gd_l1vikfnt1wgvvqz95w",
      jobs_collect_by_url: "gd_lpfll7v5hcqtkxl6l",
      jobs_discover_by_url: "gd_lpfll7v5hcqtkxl6l",
      jobs_discover_by_keyword: "gd_lpfll7v5hcqtkxl6l",
      posts_collect_by_url: "gd_lyy3tktm25m4avu764",
      posts_discover_by_profile_url: "gd_lyy3tktm25m4avu764",
      posts_discover_by_url: "gd_lyy3tktm25m4avu764",
      posts_discover_by_company_url: "gd_lyy3tktm25m4avu764",
      people_discover_new_profiles: "gd_m8d03he47z8nwb5xc"
    }

    # Resolve the dataset ID registered under a symbolic endpoint name.
    #
    # The registry is read on every call, so entries added or replaced at
    # runtime are picked up immediately.
    #
    # @param key [Symbol] symbolic endpoint name
    # @return [String] dataset ID
    # @raise [BrightData::ArgumentError] if key is unknown
    def self.id_for(key)
      return LINKEDIN[key] if LINKEDIN.key?(key)

      raise ::BrightData::ArgumentError, "Unknown LinkedIn dataset key: #{key.inspect}"
    end
  end
end
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module BrightData
  # Root of the gem's exception hierarchy; `rescue BrightData::Error` catches
  # every failure the gem raises itself.
  class Error < StandardError; end

  # Signals a misconfigured gem, e.g. a blank `api_token`.
  class ConfigurationError < Error; end

  # Signals an argument with an invalid shape or value.
  #
  # Deliberately a {BrightData::Error} and NOT a subclass of Ruby's
  # `::ArgumentError`: rescue it as `BrightData::ArgumentError` or via
  # `BrightData::Error`.
  class ArgumentError < Error; end

  # Signals a 401/403 response from the Bright Data API.
  class AuthError < Error; end

  # Parent of all transport-level errors; carries whatever response details
  # were available when the failure happened.
  class HTTPError < Error
    # @return [Integer, nil] HTTP status, or nil if the request never completed
    attr_reader :status

    # @return [String, nil] raw response body
    attr_reader :body

    # @return [Net::HTTPResponse, nil] raw Net::HTTP response
    attr_reader :response

    # @param message [String] human-readable error message
    # @param status [Integer, nil] HTTP status, or nil if unavailable
    # @param body [String, nil] raw response body
    # @param response [Net::HTTPResponse, nil] raw Net::HTTP response
    def initialize(message, status: nil, body: nil, response: nil)
      super(message)
      @status, @body, @response = status, body, response
    end
  end

  # Signals a 429 Too Many Requests response.
  class RateLimitError < HTTPError
    # @return [Integer, nil] value of the `Retry-After` header in seconds, or nil if absent
    attr_reader :retry_after

    # @param message [String] human-readable error message
    # @param status [Integer] HTTP status (defaults to 429)
    # @param body [String, nil] raw response body
    # @param response [Net::HTTPResponse, nil] raw Net::HTTP response
    # @param retry_after [Integer, nil] retry delay in seconds
    def initialize(message, status: 429, body: nil, response: nil, retry_after: nil)
      super(message, status: status, body: body, response: response)
      @retry_after = retry_after
    end
  end

  # Signals a 5xx response from the Bright Data API.
  class ServerError < HTTPError; end

  # Signals that `/scrape` exceeded Bright Data's 60-second synchronous cap.
  # The README also documents `Snapshot#wait` raising it when its deadline
  # passes before a terminal state is reached.
  class ScrapeTimeoutError < Error
    # @return [String] snapshot ID returned by Bright Data
    attr_reader :snapshot_id

    # @return [BrightData::Snapshot, nil] resumable snapshot returned by Bright Data
    attr_reader :snapshot

    # @param message [String] human-readable error message
    # @param snapshot_id [String] snapshot ID returned by Bright Data
    # @param snapshot [BrightData::Snapshot, nil] snapshot object that can be waited on
    def initialize(message, snapshot_id:, snapshot: nil)
      super(message)
      @snapshot_id, @snapshot = snapshot_id, snapshot
    end
  end

  # Reserved for exception-based snapshot failure flows.
  #
  # `Snapshot#wait` normally reports a failed snapshot by returning
  # `SimpleResult::Failure`; this class exists for callers who opt into
  # exception semantics in future API versions.
  class SnapshotFailedError < Error
    # @return [String] snapshot ID
    attr_reader :snapshot_id

    # @return [Hash, nil] failure details returned by Bright Data
    attr_reader :details

    # @param message [String] human-readable error message
    # @param snapshot_id [String] snapshot ID
    # @param details [Hash, nil] failure details returned by Bright Data
    def initialize(message, snapshot_id:, details: nil)
      super(message)
      @snapshot_id, @details = snapshot_id, details
    end
  end
end
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
require "net/http"
|
|
5
|
+
require "openssl"
|
|
6
|
+
require "uri"
|
|
7
|
+
|
|
8
|
+
module BrightData
  # Thin Net::HTTP wrapper. Single point of egress for the gem.
  #
  # Optional request tracing lives in {LiveTrace}, kept out of this class so the
  # request path stays a small, readable Net::HTTP shim.
  class HTTP # rubocop:disable Metrics/ClassLength -- deliberately one cohesive HTTP shim; splitting it would scatter the request path
    # @return [String] default Bright Data API base URL
    BASE_URL = "https://api.brightdata.com"

    # Must be greater than Bright Data's `/scrape` 60-second API cap.
    # @return [Integer] default socket read timeout in seconds
    DEFAULT_TIMEOUT = 90

    # @param api_token [String] Bright Data API token
    # @param base_url [String] override for testing; a trailing slash is tolerated
    # @param open_timeout [Integer] TCP open timeout in seconds
    # @param read_timeout [Integer] socket read timeout in seconds
    # @param logger [Logger, nil] optional logger for debug request traces
    # @raise [BrightData::ConfigurationError] if `api_token` is nil, empty, or whitespace-only
    def initialize(api_token:, base_url: BASE_URL, open_timeout: 10, read_timeout: DEFAULT_TIMEOUT, logger: nil)
      # `to_s.strip` rejects whitespace-only tokens too, and keeps a non-String
      # token from surfacing as a NoMethodError instead of a ConfigurationError.
      raise ConfigurationError, "api_token is required" if api_token.to_s.strip.empty?

      @api_token = api_token
      @base_url = base_url
      @open_timeout = open_timeout
      @read_timeout = read_timeout
      @logger = logger
    end

    # POST a JSON body.
    #
    # @param path [String] API path
    # @param query [Hash] query string params
    # @param body [Hash, nil] JSON body to encode
    # @return [Hash, Array, nil] parsed JSON body
    # @raise [BrightData::AuthError, BrightData::RateLimitError, BrightData::ServerError, BrightData::HTTPError]
    def post(path:, query: {}, body: nil)
      request(method: :post, path:, query:, body:)
    end

    # GET a path.
    #
    # @param path [String] API path
    # @param query [Hash] query string params
    # @return [Hash, Array, nil] parsed JSON body
    # @raise [BrightData::AuthError, BrightData::RateLimitError, BrightData::ServerError, BrightData::HTTPError]
    def get(path:, query: {})
      request(method: :get, path:, query:)
    end

    private

    # Run one request end to end: build the URI, perform the call, record it
    # on the trace and the logger, then turn the response into a parsed body
    # or an exception.
    def request(method:, path:, query: {}, body: nil)
      uri = build_uri(path:, query:)
      # NOTE(review): presumably a no-op tracer when tracing is off — confirm in LiveTrace.
      trace = LiveTrace.for(LiveTrace::Request.new(method:, path:, query:, body:, uri:))

      response, duration = timed_perform(uri:, req: build_request(method:, uri:, body:))
      trace.record_response(response:, duration:)

      log_request(method:, path:, status: response.code.to_i, duration:)

      handle_response(response)
    rescue StandardError => e
      # `trace` is still nil when the failure happened before it was assigned
      # (e.g. `build_uri` raised), hence the safe navigation.
      trace&.record_error(e)
      raise
    end

    # Perform the request and measure it on the monotonic clock.
    #
    # @return [Array(Net::HTTPResponse, Float)] response and elapsed seconds
    def timed_perform(uri:, req:)
      started = Process.clock_gettime(Process::CLOCK_MONOTONIC)

      response = perform(uri:, req:)

      [response, Process.clock_gettime(Process::CLOCK_MONOTONIC) - started]
    end

    # Join base URL and path, appending the form-encoded query when present.
    #
    # A single trailing slash on the base URL is dropped so that
    # `base_url: "https://host/"` plus `path: "/x"` does not yield `//x`.
    #
    # @return [URI::Generic]
    def build_uri(path:, query:)
      uri = URI.parse("#{@base_url.to_s.chomp("/")}#{path}")
      uri.query = URI.encode_www_form(query) unless query.empty?

      uri
    end

    # Build the Net::HTTP request object; anything other than `:post` is sent
    # as a GET. The body is JSON-encoded only when one is given.
    def build_request(method:, uri:, body:)
      klass = if method == :post
                Net::HTTP::Post
              else
                Net::HTTP::Get
              end
      req = klass.new(uri)

      apply_headers(req)
      req.body = JSON.generate(body) if body

      req
    end

    # Set the bearer token and JSON content negotiation headers.
    def apply_headers(req)
      req["Authorization"] = "Bearer #{@api_token}"
      req["Content-Type"] = "application/json"
      req["Accept"] = "application/json"
    end

    # Open a connection, send the request and return the raw response.
    #
    # Low-level timeouts and connection failures are re-raised as
    # {BrightData::HTTPError} with a nil status, so callers only ever need to
    # rescue the gem's own hierarchy.
    #
    # @raise [BrightData::HTTPError] on open/read/write timeout or connection failure
    def perform(uri:, req:) # rubocop:disable Metrics/MethodLength -- This is a simple method, the lines are from the Net::HTTP params
      Net::HTTP.start(
        uri.host,
        uri.port,
        use_ssl: uri.scheme == "https",
        open_timeout: @open_timeout,
        read_timeout: @read_timeout
      ) do |http|
        http.request(req)
      end
    rescue Net::OpenTimeout, Net::ReadTimeout, Net::WriteTimeout => e
      # Net::WriteTimeout is included so a stalled upload does not escape as a
      # raw Net::HTTP exception.
      raise HTTPError.new("Timeout: #{e.message}", status: nil, body: nil)
    rescue SocketError, SystemCallError, OpenSSL::SSL::SSLError, IOError => e
      raise HTTPError.new("Connection failed: #{e.message}", status: nil, body: nil)
    end

    # Return the parsed body for 2xx responses; raise the mapped error otherwise.
    def handle_response(response)
      status = response.code.to_i
      return parse_body(response) if (200..299).cover?(status)

      raise error_for(response, status)
    end

    # Factory mapping an error status to its exception. A dispatch at one level
    # of abstraction, so it stays a single `case`.
    #
    # @return [BrightData::Error] exception instance (not raised here)
    def error_for(response, status) # rubocop:disable Metrics/MethodLength -- A case should be together all the time
      body = response.body.to_s
      case status
      when 401, 403
        AuthError.new("Bright Data API rejected the token (status #{status}): #{body}")
      when 429
        RateLimitError.new(
          "Bright Data rate limit hit (status 429)",
          status:,
          body:,
          response:,
          retry_after: parse_retry_after(response)
        )
      when 500..599
        ServerError.new("Bright Data server error (status #{status})", status:, body:, response:)
      else
        HTTPError.new("Unexpected HTTP status #{status}", status:, body:, response:)
      end
    end

    # Parse the response body as JSON with symbolized keys.
    #
    # An empty body yields nil. A body that is not one JSON document is retried
    # as JSON Lines when it looks like one; otherwise the parse error is
    # reported as an {BrightData::HTTPError}.
    def parse_body(response)
      text = response.body.to_s
      return nil if text.empty?

      JSON.parse(text, symbolize_names: true)
    rescue JSON::ParserError => e
      return parse_json_lines(text, response:) if json_lines?(text, response:)

      raise HTTPError.new("Invalid JSON response: #{e.message}", status: response.code.to_i, body: text, response:)
    end

    # True when the content type mentions `jsonl` or the body spans several lines.
    def json_lines?(text, response:)
      response["content-type"].to_s.include?("jsonl") || text.lines.count > 1
    end

    # Parse each non-blank line as its own JSON document.
    #
    # @return [Array] one parsed value per non-blank line
    # @raise [BrightData::HTTPError] if any line is not valid JSON
    def parse_json_lines(text, response:)
      text.each_line.filter_map do |line|
        stripped = line.strip
        next if stripped.empty?

        JSON.parse(stripped, symbolize_names: true)
      end
    rescue JSON::ParserError => e
      raise HTTPError.new("Invalid JSONL response: #{e.message}", status: response.code.to_i, body: text, response:)
    end

    # Read `Retry-After` as whole seconds. Any value that is not a plain
    # non-negative integer yields nil.
    #
    # @return [Integer, nil]
    def parse_retry_after(response)
      raw = response["retry-after"] || response["Retry-After"]
      Integer(raw, 10) if raw&.match?(/\A\d+\z/)
    end

    # Emit one debug line per request; the block form skips the formatting
    # work unless the logger actually logs at debug level.
    def log_request(method:, path:, status:, duration:)
      return unless @logger

      @logger.debug do
        format("[brightdata] %<method>s %<path>s -> %<status>d in %<duration>.3fs",
               method: method.upcase, path:, status:, duration:)
      end
    end
  end
end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module BrightData
  module LinkedIn
    # Endpoint family behind `client.linkedin.companies`: LinkedIn company
    # pages looked up by URL.
    #
    # Both public methods are generated by the `endpoint` macro below, so they
    # are declared for YARD with `@!method` directives.
    #
    # @example Scrape synchronously
    #   companies = client.linkedin.companies.scrape(urls: ["https://www.linkedin.com/company/example/"])
    # @example Trigger an async collection
    #   snapshot = client.linkedin.companies.trigger(urls: ["https://www.linkedin.com/company/example/"])
    #
    # @!method trigger(urls:)
    #   @param urls [Array<String>] LinkedIn company URLs
    #   @return [BrightData::Snapshot]
    #   @raise [BrightData::ArgumentError] if `urls` is not an Array
    # @!method scrape(urls:)
    #   @param urls [Array<String>] LinkedIn company URLs
    #   @return [Array<BrightData::LinkedIn::Types::Company>]
    #   @raise [BrightData::ArgumentError] if `urls` is not an Array
    #   @raise [BrightData::ScrapeTimeoutError] when results exceed Bright Data's synchronous cap
    class Companies
      include Endpoint

      # Declares the dataset, the input/result types and the public keyword.
      endpoint dataset_key: :companies_collect_by_url,
               input: Types::CompanyUrlInput,
               result: Types::Company,
               param: :urls
    end
  end
end
|