scraper_utils 0.5.1 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. checksums.yaml +4 -4
  2. data/.yardopts +5 -0
  3. data/CHANGELOG.md +7 -0
  4. data/GUIDELINES.md +2 -1
  5. data/Gemfile +1 -0
  6. data/IMPLEMENTATION.md +40 -0
  7. data/README.md +29 -23
  8. data/SPECS.md +13 -1
  9. data/bin/rspec +27 -0
  10. data/docs/example_scrape_with_fibers.rb +4 -4
  11. data/docs/fibers_and_threads.md +72 -0
  12. data/docs/getting_started.md +6 -6
  13. data/docs/interleaving_requests.md +7 -7
  14. data/docs/parallel_requests.md +138 -0
  15. data/docs/randomizing_requests.md +12 -8
  16. data/docs/reducing_server_load.md +6 -6
  17. data/lib/scraper_utils/data_quality_monitor.rb +2 -3
  18. data/lib/scraper_utils/date_range_utils.rb +37 -78
  19. data/lib/scraper_utils/debug_utils.rb +5 -5
  20. data/lib/scraper_utils/log_utils.rb +15 -0
  21. data/lib/scraper_utils/mechanize_actions.rb +37 -8
  22. data/lib/scraper_utils/mechanize_utils/adaptive_delay.rb +79 -0
  23. data/lib/scraper_utils/mechanize_utils/agent_config.rb +31 -30
  24. data/lib/scraper_utils/mechanize_utils/robots_checker.rb +151 -0
  25. data/lib/scraper_utils/mechanize_utils.rb +8 -5
  26. data/lib/scraper_utils/randomize_utils.rb +22 -19
  27. data/lib/scraper_utils/scheduler/constants.rb +12 -0
  28. data/lib/scraper_utils/scheduler/operation_registry.rb +101 -0
  29. data/lib/scraper_utils/scheduler/operation_worker.rb +199 -0
  30. data/lib/scraper_utils/scheduler/process_request.rb +59 -0
  31. data/lib/scraper_utils/scheduler/thread_request.rb +51 -0
  32. data/lib/scraper_utils/scheduler/thread_response.rb +59 -0
  33. data/lib/scraper_utils/scheduler.rb +286 -0
  34. data/lib/scraper_utils/version.rb +1 -1
  35. data/lib/scraper_utils.rb +11 -14
  36. metadata +16 -6
  37. data/lib/scraper_utils/adaptive_delay.rb +0 -70
  38. data/lib/scraper_utils/fiber_scheduler.rb +0 -229
  39. data/lib/scraper_utils/robots_checker.rb +0 -149
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 28f415290516d19f6ffc95a7d349a6ed269de987a0ffe45ed512ff29bfc82902
- data.tar.gz: 9d337d1145754bf2375f4d2e18d89da7c21231d7b9b279bc51340f92781caa35
+ metadata.gz: 13ad14102f284c98d658bb928bcf7806ea7594326d11c5426903ebc6b1f919e0
+ data.tar.gz: 260cd94a1b76e9851f5af47f716dc754386b081f3923ee0a0eb6fb2b2d086c4f
  SHA512:
- metadata.gz: 8a3050451f512b2f77cf9cd806fc1602d6502b247f24248d93cfc12dea47bf5d7f02bd9be5453c0da7ee7b8acc0e3ee32cd375b52e04f0f366c2174ea7320bd9
- data.tar.gz: 17befcb8b9305536385ddf6772aeee038df34bd42ce197c4951836bb396e1db9d3dabece2a543ece6f60e331ca244ef1f660f1b111af17d79703a2a783801183
+ metadata.gz: 824b9e64ae7debdf9cddfc90b47de6dff7865e3f655ad6022f58181f38efd06788413e0251525410736e58d6fd325917a8b7a0ad6b2468fa7ab9de3b697955af
+ data.tar.gz: fd154118b2eaa22962f4343a3f62ca1daabc19de924a36e4c3cd4f61c9c7bb08a232554f10141b4e3534f3fa2e546a86f860d3bc7997f54d29a067cfb3c4f451
data/.yardopts ADDED
@@ -0,0 +1,5 @@
+ --files docs/*.rb
+ --files docs/*.md
+ --files CHANGELOG.md
+ --files LICENSE.txt
+ --readme README.md
data/CHANGELOG.md CHANGED
@@ -1,5 +1,12 @@
  # Changelog

+ ## 0.6.0 - 2025-03-16
+
+ * Add threads for more efficient scraping
+ * Adjust defaults for more efficient scraping, retaining just response based delays by default
+ * Correct and simplify date range utilities so everything is checked at least every `max_period` days
+ * Release Candidate for v1.0.0, subject to testing in production
+
  ## 0.5.1 - 2025-03-05

  * Remove duplicated example code in docs
data/GUIDELINES.md CHANGED
@@ -47,7 +47,8 @@ but if the file is bad, just treat it as missing.

  ## Testing Strategies

- * Avoid mocking unless really needed, instead
+ * AVOID mocking unless really needed (and REALLY avoid mocking your own code), instead
+ * Consider if you can change your own code, whilst keeping it simple, to make it easier to test
  * instantiate a real object to use in the test
  * use mocking facilities provided by the gem (eg Mechanize, Aws etc)
  * use integration tests with WebMock for simple external sites or VCR for more complex.
data/Gemfile CHANGED
@@ -32,5 +32,6 @@ gem "simplecov", platform && (platform == :heroku16 ? "~> 0.18.0" : "~> 0.22.0")
  gem "simplecov-console"
  gem "terminal-table"
  gem "webmock", platform && (platform == :heroku16 ? "~> 3.14.0" : "~> 3.19.0")
+ gem "yard"

  gemspec
data/IMPLEMENTATION.md CHANGED
@@ -31,3 +31,43 @@ puts "Pre Connect request: #{request.inspect}" if ENV["DEBUG"]
  - Externalize configuration to improve testability
  - Keep shared logic in the main class
  - Decisions / information specific to just one class, can be documented there, otherwise it belongs here
+
+ ## Testing Directory Structure
+
+ Our test directory structure reflects various testing strategies and aspects of the codebase:
+
+ ### API Context Directories
+ - `spec/scraper_utils/fiber_api/` - Tests functionality called from within worker fibers
+ - `spec/scraper_utils/main_fiber/` - Tests functionality called from the main fiber's perspective
+ - `spec/scraper_utils/thread_api/` - Tests functionality called from within worker threads
+
+ ### Utility Classes
+ - `spec/scraper_utils/mechanize_utils/` - Tests for `lib/scraper_utils/mechanize_utils/*.rb` files
+ - `spec/scraper_utils/scheduler/` - Tests for `lib/scraper_utils/scheduler/*.rb` files
+ - `spec/scraper_utils/scheduler2/` - FIXME: remove duplicate tests and merge to `spec/scraper_utils/scheduler/` unless > 200 lines
+
+ ### Integration vs Unit Tests
+ - `spec/scraper_utils/integration/` - Tests that focus on the integration between components
+ - Name tests after the most "parent-like" class of the components involved
+
+ ### Special Configuration Directories
+ These specs check the options we use when things go wrong in production
+
+ - `spec/scraper_utils/no_threads/` - Tests with threads disabled (`MORPH_DISABLE_THREADS=1`)
+ - `spec/scraper_utils/no_fibers/` - Tests with fibers disabled (`MORPH_MAX_WORKERS=0`)
+ - `spec/scraper_utils/sequential/` - Tests with exactly one worker (`MORPH_MAX_WORKERS=1`)
+
+ ### Directories to break up large specs
+ Keep specs less than 200 lines long
+
+ - `spec/scraper_utils/replacements` - Tests for replacements in MechanizeActions
+ - `spec/scraper_utils/replacements2` - FIXME: remove duplicate tests and merge to `spec/scraper_utils/replacements/`?
+ - `spec/scraper_utils/selectors` - Tests the various node selectors available in MechanizeActions
+ - `spec/scraper_utils/selectors2` - FIXME: remove duplicate tests and merge to `spec/scraper_utils/selectors/`?
+
+ ### General Testing Guidelines
+ - Respect fiber and thread context validation - never mock the objects under test
+ - Structure tests to run in the appropriate fiber context
+ - Use real fibers, threads and operations rather than excessive mocking
+ - Ensure proper cleanup of resources in both success and error paths
+ - ASK when unsure which (yard doc, spec or code) is wrong as I don't always follow the "write specs first" strategy
data/README.md CHANGED
@@ -9,28 +9,30 @@ For Server Administrators
  The ScraperUtils library is designed to be a respectful citizen of the web. If you're a server administrator and notice
  our scraper accessing your systems, here's what you should know:

- ### How to Control Our Behavior
-
- Our scraper utilities respect the standard server **robots.txt** control mechanisms (by default).
- To control our access:
-
- - Add a section for our user agent: `User-agent: ScraperUtils` (default)
- - Set a crawl delay, eg: `Crawl-delay: 20`
- - If needed specify disallowed paths: `Disallow: /private/`
-
  ### We play nice with your servers

  Our goal is to access public planning information with minimal impact on your services. The following features are on by
  default:

+ - **Limit server load**:
+   - We limit the max load we present to your server to well less than a third of a single cpu
+   - The more loaded your server is, the longer we wait between requests
+   - We respect Crawl-delay from robots.txt (see section below), so you can tell us an acceptable rate
+   - Scraper developers can
+     - reduce the max_load we present to your server even lower
+     - add random extra delays to give your server a chance to catch up with background tasks
+
  - **Identify themselves**: Our user agent clearly indicates who we are and provides a link to the project repository:
    `Mozilla/5.0 (compatible; ScraperUtils/0.2.0 2025-02-22; +https://github.com/ianheggie-oaf/scraper_utils)`

- - **Limit server load**:
-   - We wait double your response time before making another request to avoid being a significant load on your server
-   - We also randomly add extra delays to give your server a chance to catch up with background tasks
+ ### How to Control Our Behavior
+
+ Our scraper utilities respect the standard server **robots.txt** control mechanisms (by default).
+ To control our access:

- We also provide scraper developers other features to reduce overall load as well.
+ - Add a section for our user agent: `User-agent: ScraperUtils`
+ - Set a crawl delay, eg: `Crawl-delay: 20`
+ - If needed specify disallowed paths: `Disallow: /private/`

  For Scraper Developers
  ----------------------
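Taken together, the three directives listed in the new "How to Control Our Behavior" section amount to a robots.txt snippet along these lines (the 20 second delay and `/private/` path are the illustrative values from that list, not recommendations):

```text
# robots.txt - example section for this scraper's user agent
User-agent: ScraperUtils
Crawl-delay: 20
Disallow: /private/
```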
@@ -40,14 +42,15 @@ mentioned above.

  ## Installation & Configuration

- Add to your [scraper's](https://www.planningalerts.org.au/how_to_write_a_scraper) Gemfile:
+ Add to [your scraper's](https://www.planningalerts.org.au/how_to_write_a_scraper) Gemfile:

  ```ruby
  gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
  gem 'scraper_utils'
  ```

- For detailed setup and configuration options, see the [Getting Started guide](docs/getting_started.md).
+ For detailed setup and configuration options,
+ see {file:docs/getting_started.md Getting Started guide}

  ## Key Features

@@ -57,20 +60,23 @@ For detailed setup and configuration options, see the [Getting Started guide](do
  - Automatic rate limiting based on server response times
  - Supports robots.txt and crawl-delay directives
  - Supports extra actions required to get to results page
- - [Learn more about Mechanize utilities](docs/mechanize_utilities.md)
+ - {file:docs/mechanize_utilities.md Learn more about Mechanize utilities}

  ### Optimize Server Load

  - Intelligent date range selection (reduce server load by up to 60%)
  - Cycle utilities for rotating search parameters
- - [Learn more about reducing server load](docs/reducing_server_load.md)
+ - {file:docs/reducing_server_load.md Learn more about reducing server load}

  ### Improve Scraper Efficiency

- - Interleave requests to optimize run time
- - [Learn more about interleaving requests](docs/interleaving_requests.md)
+ - Interleaves requests to optimize run time
+ - {file:docs/interleaving_requests.md Learn more about interleaving requests}
+ - Use {ScraperUtils::Scheduler.execute_request} so Mechanize network requests will be performed by threads in parallel
+ - {file:docs/parallel_requests.md Parallel Request} - see Usage section for installation instructions
  - Randomize processing order for more natural request patterns
- - [Learn more about randomizing requests](docs/randomizing_requests.md)
+ - {file:docs/randomizing_requests.md Learn more about randomizing requests} - see Usage section for installation
+   instructions

  ### Error Handling & Quality Monitoring

@@ -82,11 +88,11 @@ For detailed setup and configuration options, see the [Getting Started guide](do

  - Enhanced debugging utilities
  - Simple logging with authority context
- - [Learn more about debugging](docs/debugging.md)
+ - {file:docs/debugging.md Learn more about debugging}

  ## API Documentation

- Complete API documentation is available at [RubyDoc.info](https://rubydoc.info/gems/scraper_utils).
+ Complete API documentation is available at [scraper_utils | RubyDoc.info](https://rubydoc.info/gems/scraper_utils).

  ## Ruby Versions

@@ -105,7 +111,7 @@ To install this gem onto your local machine, run `bundle exec rake install`.
  ## Contributing

  Bug reports and pull requests with working tests are welcome
- on [GitHub](https://github.com/ianheggie-oaf/scraper_utils).
+ on [ianheggie-oaf/scraper_utils | GitHub](https://github.com/ianheggie-oaf/scraper_utils).

  ## License

data/SPECS.md CHANGED
@@ -6,7 +6,13 @@ installation and usage notes in `README.md`.

  ASK for clarification of any apparent conflicts with IMPLEMENTATION, GUIDELINES or project instructions.

- ## Core Design Principles
+ Core Design Principles
+ ----------------------
+
+ ## Coding Style and Complexity
+ - KISS (Keep it Simple and Stupid) is a guiding principle:
+   - Simple: Design and implement with as little complexity as possible while still achieving the desired functionality
+   - Stupid: Should be easy to diagnose and repair with basic tooling

  ### Error Handling
  - Record-level errors abort only that record's processing
@@ -23,3 +29,9 @@ ASK for clarification of any apparent conflicts with IMPLEMENTATION, GUIDELINES
  - Ensure components are independently testable
  - Avoid timing-based tests in favor of logic validation
  - Keep test scenarios focused and under 20 lines
+
+ #### Fiber and Thread Testing
+ - Test in appropriate fiber/thread context using API-specific directories
+ - Validate cooperative concurrency with real fibers rather than mocks
+ - Ensure tests for each context: main fiber, worker fibers, and various thread configurations
+ - Test special configurations (no threads, no fibers, sequential) in dedicated directories
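As a sketch of the "real fibers rather than mocks" point above, a spec in one of the API context directories can simply drive the code under test from inside an actual `Fiber`; the file name and expectations here are hypothetical:

```ruby
# spec/scraper_utils/fiber_api/fiber_context_spec.rb (hypothetical example)
RSpec.describe "running code under test inside a real fiber" do
  it "executes the block in a different fiber to the caller" do
    seen = nil
    worker = Fiber.new { seen = Fiber.current }
    worker.resume
    expect(seen).not_to be_nil
    expect(seen).not_to eq(Fiber.current) # it really ran in another fiber
  end
end
```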
data/bin/rspec ADDED
@@ -0,0 +1,27 @@
+ #!/usr/bin/env ruby
+ # frozen_string_literal: true
+
+ #
+ # This file was generated by Bundler.
+ #
+ # The application 'rspec' is installed as part of a gem, and
+ # this file is here to facilitate running it.
+ #
+
+ ENV["BUNDLE_GEMFILE"] ||= File.expand_path("../Gemfile", __dir__)
+
+ bundle_binstub = File.expand_path("bundle", __dir__)
+
+ if File.file?(bundle_binstub)
+   if File.read(bundle_binstub, 300).include?("This file was generated by Bundler")
+     load(bundle_binstub)
+   else
+     abort("Your `bin/bundle` was not generated by Bundler, so this binstub cannot run.
+ Replace `bin/bundle` by running `bundle binstubs bundler --force`, then run this command again.")
+   end
+ end
+
+ require "rubygems"
+ require "bundler/setup"
+
+ load Gem.bin_path("rspec-core", "rspec")
data/docs/example_scrape_with_fibers.rb CHANGED
@@ -3,11 +3,11 @@
  # Example scrape method updated to use ScraperUtils::FibreScheduler

  def scrape(authorities, attempt)
-   ScraperUtils::FiberScheduler.reset!
+   ScraperUtils::Scheduler.reset!
    exceptions = {}
    authorities.each do |authority_label|
-     ScraperUtils::FiberScheduler.register_operation(authority_label) do
-       ScraperUtils::FiberScheduler.log(
+     ScraperUtils::Scheduler.register_operation(authority_label) do
+       ScraperUtils::LogUtils.log(
          "Collecting feed data for #{authority_label}, attempt: #{attempt}..."
        )
        ScraperUtils::DataQualityMonitor.start_authority(authority_label)
@@ -26,6 +26,6 @@ def scrape(authorities, attempt)
      end
      # end of register_operation block
    end
-   ScraperUtils::FiberScheduler.run_all
+   ScraperUtils::Scheduler.run_operations
    exceptions
  end
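Piecing the two hunks above together, the updated method reads roughly as follows; `YourScraper.scrape` and `ScraperUtils::DbUtils.save_record` are placeholders for the scraper-specific code that this diff does not show:

```ruby
def scrape(authorities, attempt)
  ScraperUtils::Scheduler.reset!
  exceptions = {}
  authorities.each do |authority_label|
    ScraperUtils::Scheduler.register_operation(authority_label) do
      begin
        ScraperUtils::LogUtils.log(
          "Collecting feed data for #{authority_label}, attempt: #{attempt}..."
        )
        ScraperUtils::DataQualityMonitor.start_authority(authority_label)
        YourScraper.scrape(authority_label) do |record| # placeholder for your scraping code
          ScraperUtils::DbUtils.save_record(record)     # assumed helper, not shown in this diff
        end
      rescue StandardError => e
        warn "#{authority_label}: ERROR: #{e}"
        exceptions[authority_label] = e
      end
    end
    # end of register_operation block
  end
  ScraperUtils::Scheduler.run_operations
  exceptions
end
```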
data/docs/fibers_and_threads.md ADDED
@@ -0,0 +1,72 @@
+ Fibers and Threads
+ ==================
+
+ This sequence diagram supplements the notes on the {ScraperUtils::Scheduler} class and is intended to help show
+ the passing of messages and control between the fibers and threads.
+
+ * To keep things simple I have only shown the Fibers and Threads and not all the other calls like to the
+   OperationRegistry to lookup the current operation, or OperationWorker etc.
+ * There is ONE (global) response queue, which is monitored by the Scheduler.run_operations loop in the main Fiber
+ * Each authority has ONE OperationWorker (not shown), which has ONE Fiber, ONE Thread, ONE request queue.
+ * I use "◀─▶" to indicate a call and response, and "║" for which fiber / object is currently running.
+
+ ```text
+
+ SCHEDULER (Main Fiber)
+ NxRegister-operation RESPONSE.Q
+ ║──creates────────◀─▶┐
+ ║ │ FIBER (runs block passed to register_operation)
+ ║──creates──────────────────◀─▶┐ WORKER object and Registry
+ ║──registers(fiber)───────────────────────▶┐ REQUEST-Q
+ │ │ │ ║──creates─◀─▶┐ THREAD
+ │ │ │ ║──creates───────────◀─▶┐
+ ║◀─────────────────────────────────────────┘ ║◀──pop───║ ...[block waiting for request]
+ ║ │ │ │ ║ │
+ run_operations │ │ │ ║ │
+ ║──pop(non block)─◀─▶│ │ │ ║ │ ...[no responses yet]
+ ║ │ │ │ ║ │
+ ║───resumes-next─"can_resume"─────────────▶┐ ║ │
+ │ │ │ ║ ║ │
+ │ │ ║◀──resume──┘ ║ │ ...[first Resume passes true]
+ │ │ ║ │ ║ │ ...[initialise scraper]
+ ```
+ **REPEATS FROM HERE**
+ ```text
+ SCHEDULER RESPONSE.Q FIBER WORKER REQUEST.Q THREAD
+ │ │ ║──request─▶┐ ║ │
+ │ │ │ ║──push req ─▶║ │
+ │ │ ║◀──────────┘ ║──req───▶║
+ ║◀──yields control─(waiting)───┘ │ │ ║
+ ║ │ │ │ │ ║ ...[Executes network I/O request]
+ ║ │ │ │ │ ║
+ ║───other-resumes... │ │ │ │ ║ ...[Other Workers will be resumed
+ ║ │ │ │ │ ║ till most 99% are waiting on
+ ║───lots of │ │ │ │ ║ responses from their threads
+ ║ short sleeps ║◀──pushes response───────────────────────────┘
+ ║ ║ │ │ ║◀──pop───║ ...[block waiting for request]
+ ║──pop(response)──◀─▶║ │ │ ║ │
+ ║ │ │ │ ║ │
+ ║──saves─response───────────────────────◀─▶│ ║ │
+ ║ │ │ │ ║ │
+ ║───resumes-next─"can_resume"─────────────▶┐ ║ │
+ │ │ │ ║ ║ │
+ │ │ ║◀──resume──┘ ║ │ ...[Resume passes response]
+ │ │ ║ │ ║ │
+ │ │ ║ │ ║ │ ...[Process Response]
+ ```
+ **REPEATS TO HERE** - WHEN FIBER FINISHES, instead it:
+ ```text
+ SCHEDULER RESPONSE.Q FIBER WORKER REQUEST.Q THREAD
+ │ │ ║ │ ║ │
+ │ │ ║─deregister─▶║ ║ │
+ │ │ │ ║──close───▶║ │
+ │ │ │ ║ ║──nil───▶┐
+ │ │ │ ║ │ ║ ... [thread exits]
+ │ │ │ ║──join────────────◀─▶┘
+ │ │ │ ║ ....... [worker removes
+ │ │ │ ║ itself from registry]
+ │ │ ║◀──returns───┘
+ │◀──returns─nil────────────────┘
+ │ │
+ ```
+ When the last fiber finishes and the registry is empty, then the response queue is also removed
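Stripped of the gem's classes, the pattern in the diagram is a fiber that yields between queuing a request and receiving its response, a thread blocking on a per-worker request queue, and one shared response queue. A minimal, hypothetical Ruby sketch of just that plumbing (not the gem's actual implementation):

```ruby
# Hypothetical sketch: one worker thread blocks on its request queue, executes
# each request, and pushes the result onto a shared response queue; the worker
# fiber yields control between queuing its request and receiving the response.
response_queue = Queue.new   # ONE global response queue, polled by the main loop
request_queue  = Queue.new   # ONE request queue per worker

thread = Thread.new do
  while (request = request_queue.pop) # nil means "close down"
    result = request[:subject].send(request[:method], *request[:args])
    response_queue << { id: request[:id], result: result }
  end
end

worker_fiber = Fiber.new do
  request_queue << { id: :demo, subject: Kernel, method: :sleep, args: [0.1] }
  response = Fiber.yield              # control returns to the main loop here
  puts "Worker received: #{response.inspect}"
end

# The Scheduler's role: resume the fiber so it queues its request, pop the
# response the thread produced, then resume the fiber again with that response.
worker_fiber.resume
worker_fiber.resume(response_queue.pop)

request_queue << nil                  # ask the thread to exit, then wait for it
thread.join
```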
data/docs/getting_started.md CHANGED
@@ -54,14 +54,14 @@ export DEBUG=1 # for basic, or 2 for verbose or 3 for tracing nearly everything

  ## Example Scraper Implementation

- Update your `scraper.rb` as per [example scraper](example_scraper.rb)
+ Update your `scraper.rb` as per {file:example_scraper.rb example scraper}

- For more advanced implementations, see the [Interleaving Requests documentation](interleaving_requests.md).
+ For more advanced implementations, see the {file:interleaving_requests.md Interleaving Requests documentation}.

  ## Logging Tables

  The following logging tables are created for use in monitoring failure patterns and debugging issues.
- Records are automaticaly cleared after 30 days.
+ Records are automatically cleared after 30 days.

  The `ScraperUtils::LogUtils.log_scraping_run` call also logs the information to the `scrape_log` table.

@@ -69,6 +69,6 @@ The `ScraperUtils::LogUtils.save_summary_record` call also logs the information

  ## Next Steps

- - [Reducing Server Load](reducing_server_load.md)
- - [Mechanize Utilities](mechanize_utilities.md)
- - [Debugging](debugging.md)
+ - {file:reducing_server_load.md Reducing Server Load}
+ - {file:mechanize_utilities.md Mechanize Utilities}
+ - {file:debugging.md Debugging}
data/docs/interleaving_requests.md CHANGED
@@ -1,6 +1,6 @@
- # Interleaving Requests with FiberScheduler
+ # Interleaving Requests with Scheduler

- The `ScraperUtils::FiberScheduler` provides a lightweight utility that:
+ The `ScraperUtils::Scheduler` provides a lightweight utility that:

  * Works on other authorities while in the delay period for an authority's next request
  * Optimizes the total scraper run time
@@ -12,21 +12,21 @@ The `ScraperUtils::FiberScheduler` provides a lightweight utility that:
  ## Implementation

  To enable fiber scheduling, change your scrape method as per
- [example scrape with fibers](example_scrape_with_fibers.rb)
+ {example_scrape_with_fibers.rb example scrape with fibers}

- ## Logging with FiberScheduler
+ ## Logging with Scheduler

- Use `ScraperUtils::FiberScheduler.log` instead of `puts` when logging within the authority processing code.
+ Use {ScraperUtils::LogUtils.log} instead of `puts` when logging within the authority processing code.
  This will prefix the output lines with the authority name, which is needed since the system will interleave the work and
  thus the output.

  ## Testing Considerations

- This uses `ScraperUtils::RandomizeUtils` for determining the order of operations. Remember to add the following line to
+ This uses {ScraperUtils::RandomizeUtils} for determining the order of operations. Remember to add the following line to
  `spec/spec_helper.rb`:

  ```ruby
  ScraperUtils::RandomizeUtils.sequential = true
  ```

- For full details, see the [FiberScheduler class documentation](https://rubydoc.info/gems/scraper_utils/ScraperUtils/FiberScheduler).
+ For full details, see the {Scheduler}.
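For example, inside a `register_operation` block the call is a drop-in replacement for `puts` (the message text below is illustrative):

```ruby
ScraperUtils::Scheduler.register_operation(authority_label) do
  # Output is prefixed with the authority name, so interleaved logs stay readable
  ScraperUtils::LogUtils.log "Checking the last 30 days of applications..."
  # ... the rest of this authority's processing ...
end
```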
data/docs/parallel_requests.md ADDED
@@ -0,0 +1,138 @@
+ Parallel Request Processing
+ ===========================
+
+ The ScraperUtils library provides a mechanism for executing network I/O requests in parallel using a thread for each
+ operation worker, allowing the fiber to yield control and allow other fibers to process whilst the thread processes the
+ mechanize network I/O request.
+
+ This can be disabled by setting `MORPH_DISABLE_THREADS` ENV var to a non-blank value.
+
+ Overview
+ --------
+
+ When scraping multiple authority websites, around 99% of the time was spent waiting for network I/O. While the
+ `Scheduler`
+ efficiently interleaves fibers during delay periods, network I/O requests will still block a fiber until they
+ complete.
+
+ The `OperationWorker` optimizes this process by:
+
+ 1. Executing mechanize network operations in parallel using a thread for each operation_worker and fiber
+ 2. Allowing other fibers to continue working while waiting for thread responses
+ 3. Integrating seamlessly with the existing `Scheduler`
+
+ Usage
+ -----
+
+ ```ruby
+ # In your authority scraper block
+ ScraperUtils::Scheduler.register_operation("authority_name") do
+   # Instead of:
+   # page = agent.get(url)
+
+   # Use:
+   page = ScraperUtils::Scheduler.execute_request(agent, :get, [url])
+
+   # Process page as normal
+   process_page(page)
+ end
+ ```
+
+ For testing purposes, you can also execute non-network operations:
+
+ ```ruby
+ # Create a test object
+ test_object = Object.new
+
+ def test_object.sleep_test(duration)
+   sleep(duration)
+   "Completed after #{duration} seconds"
+ end
+
+ # Queue a sleep command
+ command = ScraperUtils::ProcessRequest.new(
+   "test_id",
+   test_object,
+   :sleep_test,
+   [0.5]
+ )
+
+ thread_scheduler.queue_request(command)
+ ```
+
+ Configuration
+ -------------
+
+ The following ENV variables affect how `Scheduler` is configured:
+
+ * `MORPH_DISABLE_THREADS=1` disables the use of threads
+ * `MORPH_MAX_WORKERS=N` configures the system to a max of N workers (minimum 1).
+   If N is 1 then this forces the system to process one authority at a time.
+
+ Key Components
+ --------------
+
+ ### ThreadRequest
+
+ A value object encapsulating a command to be executed:
+
+ - External ID: Any value suitable as a hash key (String, Symbol, Integer, Object) that identifies the command
+ - Subject: The object to call the method on
+ - Method: The method to call on the subject
+ - Args: Arguments to pass to the method
+
+ ### ThreadResponse
+
+ A value object encapsulating a response:
+
+ - External ID: Matches the ID from the original command
+ - Result: The result of the operation
+ - Error: Any error that occurred
+ - Time Taken: Execution time in seconds
+
+ ### ThreadPool
+
+ Manages a pool of threads that execute commands:
+
+ - Processes commands from a queue
+ - Returns responses with matching external IDs
+ - Provides clear separation between I/O and scheduling
+
+ Benefits
+ --------
+
+ 1. **Improved Throughput**: Process multiple operations simultaneously
+ 2. **Reduced Total Runtime**: Make better use of wait time during network operations
+ 3. **Optimal Resource Usage**: Efficiently balance CPU and network operations
+ 4. **Better Geolocation Handling**: Distribute requests across proxies more efficiently
+ 5. **Testability**: Execute non-network operations for testing concurrency
+
+ Debugging
+ ---------
+
+ When debugging issues with parallel operations, use:
+
+ ```shell
+ # Set debug level to see request/response logging
+ export DEBUG=2
+ ```
+
+ The system will log:
+
+ - When commands are queued
+ - When responses are received
+ - How long each operation took
+ - Any errors that occurred
+
+ ## Implementation Details
+
+ The integration between `Scheduler` and `ThreadPool` follows these principles:
+
+ 1. `Scheduler` maintains ownership of all fiber scheduling
+ 2. `ThreadPool` only knows about commands and responses
+ 3. Communication happens via value objects with validation
+ 4. State is managed in dedicated `FiberState` objects
+ 5. Each component has a single responsibility
+
+ This design provides a clean separation of concerns while enabling parallel operations within the existing fiber
+ scheduling framework.
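Since a request is just a (subject, method, args) triple, the same `execute_request` call can offload other Mechanize network calls such as form submission. A hedged sketch mirroring the Usage example above (the URL, form name and `process_page` helper are illustrative):

```ruby
require "mechanize"

ScraperUtils::Scheduler.register_operation("another_authority") do
  agent = Mechanize.new # or an agent configured via the gem's Mechanize utilities
  page = ScraperUtils::Scheduler.execute_request(agent, :get, ["https://example.com/applications"])

  form = page.form_with(name: "searchForm")
  form["date_from"] = "2025-03-01"
  # Submitting the form is also network I/O, so hand it to the worker thread too
  results = ScraperUtils::Scheduler.execute_request(agent, :submit, [form])
  process_page(results) # placeholder, as in the Usage example above
end
```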
data/docs/randomizing_requests.md CHANGED
@@ -1,9 +1,11 @@
- # Randomizing Requests
+ Randomizing Requests
+ ====================

  `ScraperUtils::RandomizeUtils` provides utilities for randomizing processing order in scrapers,
  which is helpful for distributing load and avoiding predictable patterns.

- ## Basic Usage
+ Usage
+ -----

  Pass a `Collection` or `Array` to `ScraperUtils::RandomizeUtils.randomize_order` to randomize it in production, but
  receive it as is when testing.
@@ -18,17 +20,19 @@ records.each do |record|
  end
  ```

- ## Testing Configuration
+ Testing Configuration
+ ---------------------

  Enforce sequential mode when testing by adding the following code to `spec/spec_helper.rb`:

  ```ruby
- ScraperUtils::RandomizeUtils.sequential = true
+ ScraperUtils::RandomizeUtils.random = false
  ```

- ## Notes
+ Notes
+ -----

- * You can also force sequential mode by setting the env variable `MORPH_PROCESS_SEQUENTIALLY` to `1` (any non-blank value)
- * Testing using VCR requires sequential mode
+ * You can also disable random mode by setting the env variable `MORPH_DISABLE_RANDOM` to `1` (or any non-blank value)
+ * Testing using VCR requires random to be disabled

- For full details, see the [RandomizeUtils class documentation](https://rubydoc.info/gems/scraper_utils/ScraperUtils/RandomizeUtils).
+ For full details, see {ScraperUtils::RandomizeUtils Randomize Utils class documentation}
data/docs/reducing_server_load.md CHANGED
@@ -12,8 +12,8 @@ records:

  - Always checks the most recent 4 days daily (configurable)
  - Progressively reduces search frequency for older records
- - Uses a Fibonacci-like progression to create natural, efficient search intervals
- - Configurable `max_period` (default is 3 days)
+ - Uses a progression from every 2 days upwards to create efficient search intervals
+ - Configurable `max_period` (default is 2 days)
  - Merges adjacent search ranges and handles the changeover in search frequency by extending some searches

  Example usage in your scraper:

@@ -28,11 +28,11 @@ date_ranges.each do |from_date, to_date, _debugging_comment|
  end
  ```

- Typical server load reductions:
+ Typical server load compared to searching all days each time:

- * Max period 2 days : ~42% of the 33 days selected
- * Max period 3 days : ~37% of the 33 days selected (default)
- * Max period 5 days : ~35% (or ~31% when days = 45)
+ * Max period 2 days : ~59% of the 33 days selected (default, alternates between 57% and 61% covered)
+ * Max period 3 days : ~50% of the 33 days selected (varies much more - between 33 and 67%)
+ * Max period 4 days : ~46% (more efficient if you search back 50 or more days, varies between 15 and 61%)

  See the [DateRangeUtils class documentation](https://rubydoc.info/gems/scraper_utils/ScraperUtils/DateRangeUtils) for customizing defaults and passing options.

data/lib/scraper_utils/data_quality_monitor.rb CHANGED
@@ -13,7 +13,6 @@ module ScraperUtils
      # Notes the start of processing an authority and clears any previous stats
      #
      # @param authority_label [Symbol] The authority we are processing
-     # @return [void]
      def self.start_authority(authority_label)
        @stats ||= {}
        @stats[authority_label] = { saved: 0, unprocessed: 0 }
@@ -41,7 +40,7 @@ module ScraperUtils
      def self.log_unprocessable_record(exception, record)
        authority_label = extract_authority(record)
        @stats[authority_label][:unprocessed] += 1
-       ScraperUtils::FiberScheduler.log "Erroneous record #{authority_label} - #{record&.fetch(
+       ScraperUtils::LogUtils.log "Erroneous record #{authority_label} - #{record&.fetch(
          'address', nil
        ) || record.inspect}: #{exception}"
        return unless @stats[authority_label][:unprocessed] > threshold(authority_label)
@@ -58,7 +57,7 @@ module ScraperUtils
      def self.log_saved_record(record)
        authority_label = extract_authority(record)
        @stats[authority_label][:saved] += 1
-       ScraperUtils::FiberScheduler.log "Saving record #{authority_label} - #{record['address']}"
+       ScraperUtils::LogUtils.log "Saving record #{authority_label} - #{record['address']}"
      end
    end
  end
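As a usage note for the three monitor methods touched above, a scraper typically wires them together along these lines (the record hash keys other than `address` and the rescued exception class are illustrative):

```ruby
ScraperUtils::DataQualityMonitor.start_authority(:example_authority)

record = { "authority_label" => "example_authority", "address" => "1 Example St" }
begin
  # ... validate and save the record here ...
  ScraperUtils::DataQualityMonitor.log_saved_record(record)
rescue StandardError => e
  # Counts the failure against this authority's threshold (see the check in the diff above)
  ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
end
```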