sec_api 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.devcontainer/Dockerfile +54 -0
- data/.devcontainer/README.md +178 -0
- data/.devcontainer/devcontainer.json +46 -0
- data/.devcontainer/docker-compose.yml +28 -0
- data/.devcontainer/post-create.sh +51 -0
- data/.devcontainer/post-start.sh +44 -0
- data/.rspec +3 -0
- data/.standard.yml +3 -0
- data/CHANGELOG.md +5 -0
- data/CLAUDE.md +0 -0
- data/LICENSE.txt +21 -0
- data/MIGRATION.md +274 -0
- data/README.md +370 -0
- data/Rakefile +10 -0
- data/config/secapi.yml.example +57 -0
- data/docs/development-guide.md +291 -0
- data/docs/enumerator_pattern_design.md +483 -0
- data/docs/examples/README.md +58 -0
- data/docs/examples/backfill_filings.rb +419 -0
- data/docs/examples/instrumentation.rb +583 -0
- data/docs/examples/query_builder.rb +308 -0
- data/docs/examples/streaming_notifications.rb +491 -0
- data/docs/index.md +244 -0
- data/docs/migration-guide-v1.md +1091 -0
- data/docs/pre-review-checklist.md +145 -0
- data/docs/project-overview.md +90 -0
- data/docs/project-scan-report.json +60 -0
- data/docs/source-tree-analysis.md +190 -0
- data/lib/sec_api/callback_helper.rb +49 -0
- data/lib/sec_api/client.rb +606 -0
- data/lib/sec_api/collections/filings.rb +267 -0
- data/lib/sec_api/collections/fulltext_results.rb +86 -0
- data/lib/sec_api/config.rb +590 -0
- data/lib/sec_api/deep_freezable.rb +42 -0
- data/lib/sec_api/errors/authentication_error.rb +24 -0
- data/lib/sec_api/errors/configuration_error.rb +5 -0
- data/lib/sec_api/errors/error.rb +75 -0
- data/lib/sec_api/errors/network_error.rb +26 -0
- data/lib/sec_api/errors/not_found_error.rb +23 -0
- data/lib/sec_api/errors/pagination_error.rb +28 -0
- data/lib/sec_api/errors/permanent_error.rb +29 -0
- data/lib/sec_api/errors/rate_limit_error.rb +57 -0
- data/lib/sec_api/errors/reconnection_error.rb +34 -0
- data/lib/sec_api/errors/server_error.rb +25 -0
- data/lib/sec_api/errors/transient_error.rb +28 -0
- data/lib/sec_api/errors/validation_error.rb +23 -0
- data/lib/sec_api/extractor.rb +122 -0
- data/lib/sec_api/filing_journey.rb +477 -0
- data/lib/sec_api/mapping.rb +125 -0
- data/lib/sec_api/metrics_collector.rb +411 -0
- data/lib/sec_api/middleware/error_handler.rb +250 -0
- data/lib/sec_api/middleware/instrumentation.rb +186 -0
- data/lib/sec_api/middleware/rate_limiter.rb +541 -0
- data/lib/sec_api/objects/data_file.rb +34 -0
- data/lib/sec_api/objects/document_format_file.rb +45 -0
- data/lib/sec_api/objects/entity.rb +92 -0
- data/lib/sec_api/objects/extracted_data.rb +118 -0
- data/lib/sec_api/objects/fact.rb +147 -0
- data/lib/sec_api/objects/filing.rb +197 -0
- data/lib/sec_api/objects/fulltext_result.rb +66 -0
- data/lib/sec_api/objects/period.rb +96 -0
- data/lib/sec_api/objects/stream_filing.rb +194 -0
- data/lib/sec_api/objects/xbrl_data.rb +356 -0
- data/lib/sec_api/query.rb +423 -0
- data/lib/sec_api/rate_limit_state.rb +130 -0
- data/lib/sec_api/rate_limit_tracker.rb +154 -0
- data/lib/sec_api/stream.rb +841 -0
- data/lib/sec_api/structured_logger.rb +199 -0
- data/lib/sec_api/types.rb +32 -0
- data/lib/sec_api/version.rb +42 -0
- data/lib/sec_api/xbrl.rb +220 -0
- data/lib/sec_api.rb +137 -0
- data/sig/sec_api.rbs +4 -0
- metadata +217 -0
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# frozen_string_literal: true

module SecApi
  # Error Taxonomy (Architecture ADR-2: Error Handling Strategy)
  #
  # SecApi uses a type-based retry taxonomy to distinguish retryable from
  # non-retryable failures:
  #
  #   SecApi::Error (base)
  #   ├── TransientError (retryable) - Network issues, server errors, rate limits
  #   │   ├── NetworkError   - Timeouts, connection failures, SSL errors
  #   │   │   └── ReconnectionError - WebSocket reconnection attempts exhausted
  #   │   ├── ServerError    - HTTP 5xx responses
  #   │   └── RateLimitError - HTTP 429 responses
  #   └── PermanentError (fail-fast) - Client errors that require code/config changes
  #       ├── AuthenticationError - HTTP 401, 403
  #       ├── NotFoundError       - HTTP 404
  #       ├── PaginationError     - No further page to fetch
  #       └── ValidationError     - HTTP 400, 422, XBRL validation
  #
  # Design rationale: The retry middleware checks `error.is_a?(TransientError)` to
  # determine retry eligibility. This enables automatic recovery for temporary
  # issues (NFR5: 95%+ recovery) while failing fast on permanent errors to avoid
  # wasting resources.
  #
  # Base error class for all sec_api errors.
  #
  # Every error can carry a request_id for correlation with logs and
  # instrumentation callbacks. When a non-empty request_id is present, the error
  # message is prefixed with `[request_id]` for easy log correlation.
  #
  # @example Accessing request_id from error
  #   begin
  #     client.query.ticker("AAPL").search
  #   rescue SecApi::Error => e
  #     logger.error("Request failed", request_id: e.request_id, error: e.message)
  #     Bugsnag.notify(e, request_id: e.request_id)
  #   end
  #
  # @example Error message format with request_id
  #   # When request_id is present:
  #   # => "[abc123-def456] Rate limit exceeded (429 Too Many Requests)."
  #   #
  #   # When request_id is nil or empty:
  #   # => "Rate limit exceeded (429 Too Many Requests)."
  #
  # @example Correlating with distributed tracing
  #   begin
  #     client.query.ticker("AAPL").search
  #   rescue SecApi::Error => e
  #     # The request_id matches the trace ID from your APM system
  #     # if you configured external request_id via custom middleware
  #     Datadog.tracer.active_span&.set_tag('sec_api.request_id', e.request_id)
  #   end
  #
  class Error < StandardError
    # The request correlation ID attached to this error.
    # @return [String, nil] request ID, or nil if none was supplied
    attr_reader :request_id

    # Creates a new error with an optional request correlation ID.
    #
    # @param message [String, nil] Error message; when nil, Ruby's default
    #   (the exception class name) is used
    # @param request_id [String, nil] Request correlation ID for tracing
    def initialize(message = nil, request_id: nil)
      # Must be assigned before build_message runs, since it reads @request_id.
      @request_id = request_id
      super(build_message(message))
    end

    private

    # Builds the final message, prefixing it with the request_id when one is set.
    #
    # @param message [String, nil] Original error message
    # @return [String, nil] The untouched message when there is no usable
    #   request_id, otherwise "[request_id] message"
    def build_message(message)
      return message if @request_id.nil? || @request_id.to_s.empty?

      # A nil message would otherwise render as "[id] " and lose the default
      # class-name message that StandardError provides, so substitute it here.
      "[#{@request_id}] #{message || self.class.name}"
    end
  end
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SecApi
  # Signals a network connectivity problem (timeout, connection failure).
  #
  # Classified as a TransientError: the request itself is valid, only the
  # network path is broken - a brief blip, an overloaded router, or a momentary
  # DNS failure - so a retry has a high probability of succeeding. The retry
  # middleware retries these automatically.
  #
  # Wrapped Faraday exceptions: TimeoutError, ConnectionFailed, SSLError.
  #
  # @example Handling network errors
  #   begin
  #     client.query.ticker("AAPL").search
  #   rescue SecApi::NetworkError => e
  #     # Retries exhausted - persistent connectivity issue
  #     logger.error("Network error: #{e.message}")
  #     check_network_status
  #   end
  class NetworkError < TransientError; end
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SecApi
  # Signals that the requested resource does not exist (404 Not Found).
  #
  # Classified as a PermanentError: an invalid ticker, a nonexistent CIK, or a
  # filing that is not in the database will not appear because the request is
  # repeated. The caller has to correct the query parameters or confirm that
  # the resource exists.
  #
  # @example Handling not found errors
  #   begin
  #     client.query.ticker("INVALID").search
  #   rescue SecApi::NotFoundError => e
  #     # Correct the ticker symbol or filing identifier
  #     logger.warn("Resource not found: #{e.message}")
  #     prompt_user_for_valid_ticker
  #   end
  class NotFoundError < PermanentError; end
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SecApi
  # Signals that a pagination operation cannot be completed.
  #
  # Raised when the next page of results is requested but no further pages are
  # available. It is a PermanentError because repeating the call cannot
  # produce a page that does not exist.
  #
  # @example Handling pagination end
  #   begin
  #     next_page = filings.fetch_next_page
  #   rescue SecApi::PaginationError => e
  #     puts "No more pages available"
  #   end
  #
  # @example Checking before fetching
  #   if filings.has_more?
  #     next_page = filings.fetch_next_page
  #   else
  #     puts "Already on the last page"
  #   end
  #
  # @see SecApi::Collections::Filings#fetch_next_page
  # @see SecApi::Collections::Filings#has_more?
  class PaginationError < PermanentError; end
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SecApi
  # Common ancestor of every non-retryable (permanent) error.
  #
  # Design rationale: a PermanentError tells the retry middleware to give up
  # immediately. Retrying a 401 (bad API key) or a 404 (nonexistent resource)
  # wastes resources and only delays the inevitable failure, so the gem fails
  # fast with a clear message instead.
  #
  # Typical causes are authentication failures, validation errors, and missing
  # resources. The same request will keep failing until someone changes the
  # code, the configuration, or the input parameters.
  #
  # @example Catching all permanent errors
  #   begin
  #     client.query.ticker("INVALID").search
  #   rescue SecApi::PermanentError => e
  #     # No retry will help - requires action
  #     logger.error("Permanent failure: #{e.message}")
  #     notify_developer(e)
  #   end
  #
  # @see TransientError for retryable errors
  class PermanentError < Error; end
end
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SecApi
  # Raised when the sec-api.io rate limit is exceeded (429 Too Many Requests).
  #
  # Why TransientError? Rate limits reset after a time window (typically 60s).
  # The request is valid - it just hit a temporary capacity limit - so it is
  # worth waiting and retrying automatically rather than failing to the user
  # (FR5.4: auto-resume). The retry middleware retries the request after
  # waiting for the rate limit to reset.
  #
  # The error includes retry context when available from response headers:
  # - {#retry_after}: Duration to wait (from Retry-After header)
  # - {#reset_at}: Timestamp when rate limit resets (from X-RateLimit-Reset header)
  #
  # @example Handling rate limits
  #   begin
  #     client.query.ticker("AAPL").search
  #   rescue SecApi::RateLimitError => e
  #     # Retries exhausted - rate limit hit repeatedly
  #     logger.warn("Rate limit exceeded: #{e.message}")
  #     if e.retry_after
  #       logger.info("Server suggests waiting #{e.retry_after} seconds")
  #     end
  #     notify_ops_team(e)
  #   end
  #
  # @example Checking reset time
  #   begin
  #     client.query.ticker("AAPL").search
  #   rescue SecApi::RateLimitError => e
  #     if e.reset_at
  #       wait_time = e.reset_at - Time.now
  #       sleep(wait_time) if wait_time.positive?
  #     end
  #   end
  #
  class RateLimitError < TransientError
    # Duration in seconds to wait before retrying (from Retry-After header).
    # @return [Integer, nil] Seconds to wait, or nil if not supplied
    attr_reader :retry_after

    # Timestamp when the rate limit window resets (from X-RateLimit-Reset header).
    # @return [Time, nil] Reset time, or nil if not supplied
    attr_reader :reset_at

    # Creates a new RateLimitError with optional retry context.
    #
    # The message is optional, matching the base Error signature, so that the
    # bare `raise SecApi::RateLimitError` form (which instantiates the class
    # without arguments) raises this error rather than an ArgumentError.
    #
    # @param message [String, nil] Error message describing the rate limit
    # @param retry_after [Integer, nil] Seconds to wait (from Retry-After header)
    # @param reset_at [Time, nil] Timestamp when rate limit resets (from X-RateLimit-Reset header)
    # @param request_id [String, nil] Request correlation ID for tracing
    def initialize(message = nil, retry_after: nil, reset_at: nil, request_id: nil)
      super(message, request_id: request_id)
      @retry_after = retry_after
      @reset_at = reset_at
    end
  end
end
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SecApi
  # Signals that WebSocket reconnection was abandoned after the maximum
  # number of attempts.
  #
  # It descends from NetworkError (and therefore TransientError) because the
  # underlying cause was likely temporary; it is surfaced to the caller only
  # once the reconnection attempts have been used up.
  #
  # @example Handling reconnection failure
  #   begin
  #     client.stream.subscribe { |f| process(f) }
  #   rescue SecApi::ReconnectionError => e
  #     logger.error("Stream failed permanently", attempts: e.attempts)
  #     # Fallback to polling via Query API
  #   end
  #
  class ReconnectionError < NetworkError
    # @!attribute [r] attempts
    #   @return [Integer] Number of reconnection attempts made
    # @!attribute [r] downtime_seconds
    #   @return [Float] Total downtime in seconds
    attr_reader :attempts, :downtime_seconds

    # All arguments are keywords; only request_id may be omitted.
    #
    # @param message [String] Error message
    # @param attempts [Integer] Number of reconnection attempts made
    # @param downtime_seconds [Float] Total downtime in seconds
    # @param request_id [String, nil] Request correlation ID (optional, WebSocket context may not have one)
    def initialize(message:, attempts:, downtime_seconds:, request_id: nil)
      super(message, request_id: request_id)
      @attempts = attempts
      @downtime_seconds = downtime_seconds
    end
  end
end
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SecApi
  # Signals that sec-api.io answered with a server error (5xx status code).
  #
  # Classified as a TransientError: server errors (500, 502, 503, 504)
  # typically point to temporary infrastructure trouble - a service restart, a
  # brief overload, an upstream timeout. The request is valid and the server
  # is only temporarily unhealthy, so the retry middleware retries it
  # automatically with exponential backoff.
  #
  # @example Handling server errors
  #   begin
  #     client.query.ticker("AAPL").search
  #   rescue SecApi::ServerError => e
  #     # Retries exhausted - persistent server issue
  #     logger.error("Server error: #{e.message}")
  #     alert_on_call_team(e)
  #   end
  class ServerError < TransientError; end
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SecApi
  # Common ancestor of every retryable (transient) error.
  #
  # Design rationale: a TransientError tells the retry middleware that the
  # failure is worth retrying because the underlying issue may resolve on its
  # own (network blip, brief overload, rate limit window reset). This supports
  # NFR5 (95%+ automatic recovery from transient failures).
  #
  # Retry behavior: the retry middleware uses `error.is_a?(TransientError)` to
  # decide retry eligibility, so every subclass is retryable automatically.
  # Typical causes are network timeouts, rate limiting, and temporary server
  # issues.
  #
  # @example Catching all transient errors
  #   begin
  #     client.query.ticker("AAPL").search
  #   rescue SecApi::TransientError => e
  #     # Auto-retry already attempted (5 times by default)
  #     logger.error("Operation failed after retries: #{e.message}")
  #   end
  #
  # @see PermanentError for non-retryable errors
  class TransientError < Error; end
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SecApi
  # Signals that request validation failed (400, 422) or that XBRL data
  # integrity issues were detected.
  #
  # Classified as a PermanentError: the client sent invalid data - a malformed
  # query, invalid parameters, a bad date format - which is a programming error
  # or bad input that will not fix itself. It is also raised for XBRL data that
  # fails heuristic validation; in that case the filing data itself is
  # malformed or incomplete and needs investigation rather than a retry.
  #
  # @example Handling validation errors
  #   begin
  #     xbrl_data = client.xbrl_to_json(accession_no: "0001234567-21-000001")
  #   rescue SecApi::ValidationError => e
  #     # Report data quality issue
  #     logger.error("XBRL validation failed: #{e.message}")
  #     report_data_quality_issue(e)
  #   end
  class ValidationError < PermanentError; end
end
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# frozen_string_literal: true

module SecApi
  # Extractor proxy for document extraction endpoints.
  #
  # Every extractor method returns the object built by ExtractedData.from_api
  # rather than the raw response hash, giving callers one consistent API surface.
  #
  # @example Extract text from filing
  #   extracted = client.extractor.extract(filing_url)
  #   extracted.text       # => "Full extracted text..."
  #   extracted.sections   # => { risk_factors: "...", financials: "..." }
  #   extracted.metadata   # => { source_url: "...", form_type: "10-K" }
  #
  # @example Extract specific sections
  #   extracted = client.extractor.extract(filing_url, sections: [:risk_factors, :mda])
  #   extracted.risk_factors  # => "Risk factor content..."
  #   extracted.mda           # => "MD&A content..."
  class Extractor
    # Maps Ruby symbols to SEC item identifiers for 10-K filings.
    # Names missing from this table are sent to the API verbatim.
    # @api private
    SECTION_MAP = {
      risk_factors: "1A",
      business: "1",
      mda: "7",
      financials: "8",
      legal_proceedings: "3",
      properties: "2",
      market_risk: "7A"
    }.freeze

    # Creates a new Extractor proxy instance.
    #
    # Extractor instances are obtained via {Client#extractor} and cached
    # for reuse. Direct instantiation is not recommended.
    #
    # @param client [SecApi::Client] The parent client; only its `connection` is used here
    # @return [SecApi::Extractor] A new extractor proxy instance
    # @api private
    def initialize(client)
      @_client = client
    end

    # Extract text and sections from an SEC filing.
    #
    # @param filing [String, Filing] The filing URL string, or any object responding to `url`
    # @param sections [Array<Symbol>, nil] Specific sections to extract (e.g., [:risk_factors, :mda]).
    #   When nil or empty, extracts the full filing text.
    #   Supported sections: :risk_factors, :business, :mda, :financials, :legal_proceedings, :properties, :market_risk
    # @param options [Hash] Additional extraction options merged into the request payload
    # @return [ExtractedData] Extracted data object built by ExtractedData.from_api
    # @raise [AuthenticationError] when API key is invalid
    # @raise [NotFoundError] when filing URL is not found
    # @raise [NetworkError] when connection fails
    # @note When extracting multiple sections, one API call is made per distinct section.
    #   This may impact latency and API usage costs for large section lists.
    #
    # @example Extract full filing
    #   extracted = client.extractor.extract(filing_url)
    #   extracted.text  # => "Full filing text..."
    #
    # @example Extract specific section (dynamic accessor)
    #   extracted = client.extractor.extract(filing_url, sections: [:risk_factors])
    #   extracted.risk_factors  # => "Risk factors content..."
    #
    # @example Extract multiple sections (dynamic accessors)
    #   extracted = client.extractor.extract(filing_url, sections: [:risk_factors, :mda])
    #   extracted.risk_factors  # => "Risk factors..."
    #   extracted.mda           # => "MD&A analysis..."
    def extract(filing, sections: nil, **options)
      url = filing.is_a?(String) ? filing : filing.url

      if sections.nil? || sections.empty?
        # Default behavior - a single request for the full filing
        response = @_client.connection.post("/extractor", {url: url}.merge(options))
        ExtractedData.from_api(response.body)
      else
        # Array() lets callers pass one section name as well as a list
        section_contents = extract_sections(url, Array(sections), options)
        ExtractedData.from_api({sections: section_contents})
      end
    end

    private

    # Extract multiple sections by making one API call per distinct section.
    #
    # Names are normalised to symbols and de-duplicated first, so passing the
    # same section twice (e.g. :mda and "mda") does not trigger a redundant
    # API call whose result would simply overwrite the earlier one.
    #
    # @param url [String] The filing URL
    # @param sections [Array<Symbol, String>] List of sections to extract
    # @param options [Hash] Additional options merged into each request payload
    # @return [Hash{Symbol => String}] Section names mapped to content;
    #   sections for which no content was found are omitted
    def extract_sections(url, sections, options)
      sections.map(&:to_sym).uniq.each_with_object({}) do |section, hash|
        item_id = SECTION_MAP[section] || section.to_s
        response = @_client.connection.post("/extractor", {
          url: url,
          item: item_id
        }.merge(options))

        # API returns sections hash or text directly
        content = extract_section_content(response.body, section)
        hash[section] = content if content
      end
    end

    # Pull one section's content out of an API response body.
    #
    # @param body [Hash, String] The API response body
    # @param section [Symbol] The requested section name
    # @return [String, nil] The section content; nil when the body is neither
    #   a String nor a Hash, or when the expected key is absent
    def extract_section_content(body, section)
      return body if body.is_a?(String)
      return nil unless body.is_a?(Hash)

      # Prefer a sections hash (symbol or string keys), else fall back to text
      sections = body[:sections] || body["sections"]
      if sections
        sections[section.to_sym] || sections[section.to_s]
      else
        body[:text] || body["text"]
      end
    end
  end
end
|