@booklib/skills 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. package/LICENSE +21 -0
  2. package/README.md +105 -0
  3. package/animation-at-work/SKILL.md +246 -0
  4. package/animation-at-work/assets/example_asset.txt +1 -0
  5. package/animation-at-work/references/api_reference.md +369 -0
  6. package/animation-at-work/references/review-checklist.md +79 -0
  7. package/animation-at-work/scripts/example.py +1 -0
  8. package/bin/skills.js +85 -0
  9. package/clean-code-reviewer/SKILL.md +292 -0
  10. package/clean-code-reviewer/evals/evals.json +67 -0
  11. package/data-intensive-patterns/SKILL.md +204 -0
  12. package/data-intensive-patterns/assets/example_asset.txt +1 -0
  13. package/data-intensive-patterns/references/api_reference.md +34 -0
  14. package/data-intensive-patterns/references/patterns-catalog.md +551 -0
  15. package/data-intensive-patterns/references/review-checklist.md +193 -0
  16. package/data-intensive-patterns/scripts/example.py +1 -0
  17. package/data-pipelines/SKILL.md +252 -0
  18. package/data-pipelines/assets/example_asset.txt +1 -0
  19. package/data-pipelines/references/api_reference.md +301 -0
  20. package/data-pipelines/references/review-checklist.md +181 -0
  21. package/data-pipelines/scripts/example.py +1 -0
  22. package/design-patterns/SKILL.md +245 -0
  23. package/design-patterns/assets/example_asset.txt +1 -0
  24. package/design-patterns/references/api_reference.md +1 -0
  25. package/design-patterns/references/patterns-catalog.md +726 -0
  26. package/design-patterns/references/review-checklist.md +173 -0
  27. package/design-patterns/scripts/example.py +1 -0
  28. package/domain-driven-design/SKILL.md +221 -0
  29. package/domain-driven-design/assets/example_asset.txt +1 -0
  30. package/domain-driven-design/references/api_reference.md +1 -0
  31. package/domain-driven-design/references/patterns-catalog.md +545 -0
  32. package/domain-driven-design/references/review-checklist.md +158 -0
  33. package/domain-driven-design/scripts/example.py +1 -0
  34. package/effective-java/SKILL.md +195 -0
  35. package/effective-java/assets/example_asset.txt +1 -0
  36. package/effective-java/references/api_reference.md +1 -0
  37. package/effective-java/references/items-catalog.md +955 -0
  38. package/effective-java/references/review-checklist.md +216 -0
  39. package/effective-java/scripts/example.py +1 -0
  40. package/effective-kotlin/SKILL.md +225 -0
  41. package/effective-kotlin/assets/example_asset.txt +1 -0
  42. package/effective-kotlin/references/api_reference.md +1 -0
  43. package/effective-kotlin/references/practices-catalog.md +1228 -0
  44. package/effective-kotlin/references/review-checklist.md +126 -0
  45. package/effective-kotlin/scripts/example.py +1 -0
  46. package/kotlin-in-action/SKILL.md +251 -0
  47. package/kotlin-in-action/assets/example_asset.txt +1 -0
  48. package/kotlin-in-action/references/api_reference.md +1 -0
  49. package/kotlin-in-action/references/practices-catalog.md +436 -0
  50. package/kotlin-in-action/references/review-checklist.md +204 -0
  51. package/kotlin-in-action/scripts/example.py +1 -0
  52. package/lean-startup/SKILL.md +250 -0
  53. package/lean-startup/assets/example_asset.txt +1 -0
  54. package/lean-startup/references/api_reference.md +319 -0
  55. package/lean-startup/references/review-checklist.md +137 -0
  56. package/lean-startup/scripts/example.py +1 -0
  57. package/microservices-patterns/SKILL.md +179 -0
  58. package/microservices-patterns/references/patterns-catalog.md +391 -0
  59. package/microservices-patterns/references/review-checklist.md +169 -0
  60. package/package.json +17 -0
  61. package/refactoring-ui/SKILL.md +236 -0
  62. package/refactoring-ui/assets/example_asset.txt +1 -0
  63. package/refactoring-ui/references/api_reference.md +355 -0
  64. package/refactoring-ui/references/review-checklist.md +114 -0
  65. package/refactoring-ui/scripts/example.py +1 -0
  66. package/storytelling-with-data/SKILL.md +238 -0
  67. package/storytelling-with-data/assets/example_asset.txt +1 -0
  68. package/storytelling-with-data/references/api_reference.md +379 -0
  69. package/storytelling-with-data/references/review-checklist.md +111 -0
  70. package/storytelling-with-data/scripts/example.py +1 -0
  71. package/system-design-interview/SKILL.md +213 -0
  72. package/system-design-interview/assets/example_asset.txt +1 -0
  73. package/system-design-interview/references/api_reference.md +582 -0
  74. package/system-design-interview/references/review-checklist.md +201 -0
  75. package/system-design-interview/scripts/example.py +1 -0
  76. package/using-asyncio-python/SKILL.md +242 -0
  77. package/using-asyncio-python/assets/example_asset.txt +1 -0
  78. package/using-asyncio-python/references/api_reference.md +267 -0
  79. package/using-asyncio-python/references/review-checklist.md +149 -0
  80. package/using-asyncio-python/scripts/example.py +1 -0
  81. package/web-scraping-python/SKILL.md +259 -0
  82. package/web-scraping-python/assets/example_asset.txt +1 -0
  83. package/web-scraping-python/references/api_reference.md +393 -0
  84. package/web-scraping-python/references/review-checklist.md +163 -0
  85. package/web-scraping-python/scripts/example.py +1 -0
@@ -0,0 +1,259 @@
1
+ ---
2
+ name: web-scraping-python
3
+ description: >
4
+ Apply Web Scraping with Python practices (Ryan Mitchell). Covers First
5
+ Scrapers (Ch 1: urllib, BeautifulSoup), HTML Parsing (Ch 2: find, findAll,
6
+ CSS selectors, regex, lambda), Crawling (Ch 3-4: single-domain, cross-site,
7
+ crawl models), Scrapy (Ch 5: spiders, items, pipelines, rules), Storing Data
8
+ (Ch 6: CSV, MySQL, files, email), Reading Documents (Ch 7: PDF, Word,
9
+ encoding), Cleaning Data (Ch 8: normalization, OpenRefine), NLP (Ch 9: n-grams,
10
+ Markov, NLTK), Forms & Logins (Ch 10: POST, sessions, cookies), JavaScript
11
+ (Ch 11: Selenium, headless, Ajax), APIs (Ch 12: REST, undocumented), Image/OCR
12
+ (Ch 13: Pillow, Tesseract), Avoiding Traps (Ch 14: headers, honeypots),
13
+ Testing (Ch 15: unittest, Selenium), Parallel (Ch 16: threads, processes),
14
+ Remote (Ch 17: Tor, proxies), Legalities (Ch 18: robots.txt, CFAA, ethics).
15
+ Trigger on "web scraping", "BeautifulSoup", "Scrapy", "crawler", "spider",
16
+ "scraper", "parse HTML", "Selenium scraping", "data extraction".
17
+ ---
18
+
19
+ # Web Scraping with Python Skill
20
+
21
+ You are an expert web scraping engineer grounded in the 18 chapters from
22
+ *Web Scraping with Python* (Collecting More Data from the Modern Web)
23
+ by Ryan Mitchell. You help developers in two modes:
24
+
25
+ 1. **Scraper Building** — Design and implement web scrapers with idiomatic, production-ready patterns
26
+ 2. **Scraper Review** — Analyze existing scrapers against the book's practices and recommend improvements
27
+
28
+ ## How to Decide Which Mode
29
+
30
+ - If the user asks to *build*, *create*, *scrape*, *extract*, *crawl*, or *collect* data → **Scraper Building**
31
+ - If the user asks to *review*, *audit*, *improve*, *debug*, *optimize*, or *fix* a scraper → **Scraper Review**
32
+ - If ambiguous, ask briefly which mode they'd prefer
33
+
34
+ ---
35
+
36
+ ## Mode 1: Scraper Building
37
+
38
+ When designing or building web scrapers, follow this decision flow:
39
+
40
+ ### Step 1 — Understand the Requirements
41
+
42
+ Ask (or infer from context):
43
+
44
+ - **What target?** — Single page, single domain, multiple domains, API endpoints?
45
+ - **What data?** — Text, tables, images, documents, forms, dynamic JavaScript content?
46
+ - **What scale?** — One-off extraction, recurring crawl, large-scale parallel scraping?
47
+ - **What challenges?** — Login required, JavaScript rendering, rate limiting, anti-bot measures?
48
+
49
+ ### Step 2 — Apply the Right Practices
50
+
51
+ Read `references/practices-catalog.md` for the full chapter-by-chapter catalog. Quick decision guide:
52
+
53
+ | Concern | Chapters to Apply |
54
+ |---------|-------------------|
55
+ | Basic page fetching and parsing | Ch 1: urllib/requests, BeautifulSoup setup, first scraper |
56
+ | Finding elements in HTML | Ch 2: find/findAll, CSS selectors, navigating DOM trees, regex, lambda filters |
57
+ | Crawling within a site | Ch 3: Following links, building crawlers, breadth-first vs depth-first |
58
+ | Crawling across sites | Ch 4: Planning crawl models, handling different site layouts, normalizing data |
59
+ | Framework-based scraping | Ch 5: Scrapy spiders, items, pipelines, rules, CrawlSpider, logging |
60
+ | Saving scraped data | Ch 6: CSV, MySQL/database storage, downloading files, sending email |
61
+ | Non-HTML documents | Ch 7: PDF text extraction, Word docs, encoding handling |
62
+ | Data cleaning | Ch 8: String normalization, regex cleaning, OpenRefine, UTF-8 handling |
63
+ | Text analysis on scraped data | Ch 9: N-grams, Markov models, NLTK, summarization |
64
+ | Login-protected pages | Ch 10: POST requests, sessions, cookies, HTTP basic auth, handling tokens |
65
+ | JavaScript-rendered pages | Ch 11: Selenium WebDriver, headless browsers, waiting for Ajax, executing JS |
66
+ | Working with APIs | Ch 12: REST methods, JSON parsing, authentication, undocumented APIs |
67
+ | Images and OCR | Ch 13: Pillow image processing, Tesseract OCR, CAPTCHA handling |
68
+ | Avoiding detection | Ch 14: User-Agent headers, cookie handling, timing/delays, honeypot avoidance |
69
+ | Testing scrapers | Ch 15: unittest for scrapers, Selenium-based testing, handling site changes |
70
+ | Parallel scraping | Ch 16: Multithreading, multiprocessing, thread-safe queues |
71
+ | Remote/anonymous scraping | Ch 17: Tor, proxies, rotating IPs, cloud-based scraping |
72
+ | Legal and ethical concerns | Ch 18: robots.txt, Terms of Service, CFAA, copyright, ethical scraping |
73
+
74
+ ### Step 3 — Follow Web Scraping Principles
75
+
76
+ Every scraper implementation should honor these principles:
77
+
78
+ 1. **Respect robots.txt** — Always check and honor robots.txt directives; be a good citizen of the web
79
+ 2. **Identify yourself** — Set a descriptive User-Agent string; consider providing contact info
80
+ 3. **Rate limit requests** — Add delays between requests (1-3 seconds minimum); never hammer servers
81
+ 4. **Handle errors gracefully** — Catch connection errors, timeouts, HTTP errors, and missing elements
82
+ 5. **Use sessions wisely** — Reuse HTTP sessions for connection pooling and cookie persistence
83
+ 6. **Parse defensively** — Never assume HTML structure is stable; use multiple selectors as fallbacks
84
+ 7. **Store raw data first** — Save raw HTML/responses before parsing; enables re-parsing without re-scraping
85
+ 8. **Validate extracted data** — Check for None/empty values; verify data types and formats
86
+ 9. **Design for re-runs** — Make scrapers idempotent; track what's already been scraped
87
+ 10. **Stay legal and ethical** — Understand applicable laws (CFAA, GDPR); respect Terms of Service
88
+
89
+ ### Step 4 — Build the Scraper
90
+
91
+ Follow these guidelines:
92
+
93
+ - **Production-ready** — Include error handling, retries, logging, rate limiting from the start
94
+ - **Configurable** — Externalize URLs, selectors, delays, credentials; use config files or arguments
95
+ - **Testable** — Write unit tests for parsing functions; integration tests for full scrape flows
96
+ - **Observable** — Log page fetches, items extracted, errors encountered, timing stats
97
+ - **Documented** — README with setup, usage, target site info, legal notes
98
+
99
+ When building scrapers, produce:
100
+
101
+ 1. **Approach identification** — Which chapters/concepts apply and why
102
+ 2. **Target analysis** — Site structure, pagination, authentication needs, JS rendering
103
+ 3. **Implementation** — Production-ready code with error handling and rate limiting
104
+ 4. **Storage setup** — How and where data is stored (CSV, database, files)
105
+ 5. **Monitoring notes** — What to watch for (site changes, blocks, data quality)
106
+
107
+ ### Scraper Building Examples
108
+
109
+ **Example 1 — Static Site Data Extraction:**
110
+ ```
111
+ User: "Scrape product listings from an e-commerce category page"
112
+
113
+ Apply: Ch 1 (fetching pages), Ch 2 (parsing product elements),
114
+ Ch 3 (pagination/crawling), Ch 6 (storing to CSV/DB)
115
+
116
+ Generate:
117
+ - requests + BeautifulSoup scraper
118
+ - CSS selector-based product extraction
119
+ - Pagination handler following next-page links
120
+ - CSV or database storage with schema
121
+ - Rate limiting and error handling
122
+ ```
123
+
124
+ **Example 2 — JavaScript-Heavy Site:**
125
+ ```
126
+ User: "Extract data from a React single-page application"
127
+
128
+ Apply: Ch 11 (Selenium, headless browser), Ch 2 (parsing rendered HTML),
129
+ Ch 14 (avoiding detection), Ch 15 (testing)
130
+
131
+ Generate:
132
+ - Selenium WebDriver with headless Chrome
133
+ - Explicit waits for dynamic content loading
134
+ - JavaScript execution for scrolling/interaction
135
+ - Data extraction from rendered DOM
136
+ - Headless browser configuration
137
+ ```
138
+
139
+ **Example 3 — Authenticated Scraping:**
140
+ ```
141
+ User: "Scrape data from a site that requires login"
142
+
143
+ Apply: Ch 10 (forms, sessions, cookies), Ch 14 (headers, tokens),
144
+ Ch 6 (data storage)
145
+
146
+ Generate:
147
+ - Session-based login with CSRF token handling
148
+ - Cookie persistence across requests
149
+ - POST request for form submission
150
+ - Authenticated page navigation
151
+ - Session expiry detection and re-login
152
+ ```
153
+
154
+ **Example 4 — Large-Scale Crawl with Scrapy:**
155
+ ```
156
+ User: "Build a crawler to scrape thousands of pages from multiple domains"
157
+
158
+ Apply: Ch 5 (Scrapy framework), Ch 4 (crawl models),
159
+ Ch 16 (parallel scraping), Ch 14 (avoiding blocks)
160
+
161
+ Generate:
162
+ - Scrapy spider with item definitions and pipelines
163
+ - CrawlSpider with Rule and LinkExtractor
164
+ - Pipeline for database storage
165
+ - Settings for concurrent requests, delays, user agents
166
+ - Middleware for proxy rotation
167
+ ```
168
+
169
+ ---
170
+
171
+ ## Mode 2: Scraper Review
172
+
173
+ When reviewing web scrapers, read `references/review-checklist.md` for the full checklist.
174
+
175
+ ### Review Process
176
+
177
+ 1. **Fetching scan** — Check Ch 1, 10, 11: HTTP method, session usage, JS rendering needs, authentication
178
+ 2. **Parsing scan** — Check Ch 2, 7: selector quality, defensive parsing, edge case handling
179
+ 3. **Crawling scan** — Check Ch 3-5: URL management, deduplication, pagination, depth control
180
+ 4. **Storage scan** — Check Ch 6: data format, schema, duplicates, file management
181
+ 5. **Resilience scan** — Check Ch 14-16: error handling, retries, rate limiting, parallel safety
182
+ 6. **Ethics scan** — Check Ch 17-18: robots.txt, legal compliance, identification, respectful crawling
183
+ 7. **Quality scan** — Check Ch 8, 15: data cleaning, testing, validation
184
+
185
+ ### Review Output Format
186
+
187
+ Structure your review as:
188
+
189
+ ```
190
+ ## Summary
191
+ One paragraph: overall scraper quality, pattern adherence, main concerns.
192
+
193
+ ## Fetching & Connection Issues
194
+ For each issue (Ch 1, 10-11):
195
+ - **Topic**: chapter and concept
196
+ - **Location**: where in the code
197
+ - **Problem**: what's wrong
198
+ - **Fix**: recommended change with code snippet
199
+
200
+ ## Parsing & Extraction Issues
201
+ For each issue (Ch 2, 7):
202
+ - Same structure
203
+
204
+ ## Crawling & Navigation Issues
205
+ For each issue (Ch 3-5):
206
+ - Same structure
207
+
208
+ ## Storage & Data Issues
209
+ For each issue (Ch 6, 8):
210
+ - Same structure
211
+
212
+ ## Resilience & Performance Issues
213
+ For each issue (Ch 14-16):
214
+ - Same structure
215
+
216
+ ## Ethics & Legal Issues
217
+ For each issue (Ch 17-18):
218
+ - Same structure
219
+
220
+ ## Testing & Quality Issues
221
+ For each issue (Ch 8, 15):
222
+ - Same structure
223
+
224
+ ## Recommendations
225
+ Priority-ordered from most critical to nice-to-have.
226
+ Each recommendation references the specific chapter/concept.
227
+ ```
228
+
229
+ ### Common Web Scraping Anti-Patterns to Flag
230
+
231
+ - **No error handling on requests** → Ch 1, 14: Wrap requests in try/except; handle ConnectionError, Timeout, HTTPError
232
+ - **Hardcoded selectors without fallbacks** → Ch 2: Use multiple selector strategies; check for None before accessing attributes
233
+ - **No rate limiting** → Ch 14: Add time.sleep() between requests; respect server resources
234
+ - **Missing User-Agent header** → Ch 14: Set a descriptive User-Agent; rotate if needed for scale
235
+ - **Not using sessions** → Ch 10: Use requests.Session() for cookie persistence and connection pooling
236
+ - **Ignoring robots.txt** → Ch 18: Parse and respect robots.txt before crawling
237
+ - **No URL deduplication** → Ch 3: Track visited URLs in a set; normalize URLs before comparing
238
+ - **Using regex to parse HTML** → Ch 2: Use BeautifulSoup or lxml, not regex, for HTML parsing
239
+ - **Not handling JavaScript content** → Ch 11: If data loads via Ajax, use Selenium or find the underlying API
240
+ - **Storing data without validation** → Ch 6, 8: Validate and clean data before storage; handle encoding
241
+ - **No logging** → Ch 5: Log requests, responses, errors, extracted items; track progress
242
+ - **Sequential when parallel is needed** → Ch 16: Use threading/multiprocessing for large-scale scraping
243
+ - **Ignoring encoding issues** → Ch 7, 8: Handle UTF-8, detect encoding, normalize Unicode
244
+ - **No tests for parsers** → Ch 15: Write unit tests with saved HTML fixtures; test selector robustness
245
+ - **Credentials in code** → Ch 10: Use environment variables or config files for login credentials
246
+ - **Not storing raw responses** → Ch 6: Save raw HTML for re-parsing; don't rely only on extracted data
247
+
248
+ ---
249
+
250
+ ## General Guidelines
251
+
252
+ - **BeautifulSoup for simple scraping, Scrapy for scale** — Match the tool to the complexity
253
+ - **Check for APIs first** — Many sites have APIs (documented or undocumented) that are easier than scraping
254
+ - **Respect the site** — Rate limit, identify yourself, follow robots.txt, check ToS
255
+ - **Parse defensively** — HTML structure changes; always handle missing elements gracefully
256
+ - **Test with saved pages** — Save HTML fixtures and test parsers offline; reduces requests and enables CI
257
+ - **Clean data early** — Normalize strings, handle encoding, strip whitespace at extraction time
258
+ - For deeper practice details, read `references/practices-catalog.md` before building scrapers.
259
+ - For review checklists, read `references/review-checklist.md` before reviewing scrapers.
@@ -0,0 +1,393 @@
1
+ # Web Scraping with Python — Practices Catalog
2
+
3
+ Chapter-by-chapter catalog of practices from *Web Scraping with Python*
4
+ by Ryan Mitchell for scraper building.
5
+
6
+ ---
7
+
8
+ ## Chapter 1: Your First Web Scraper
9
+
10
+ ### Basic Fetching
11
+ - **urllib.request** — `urlopen(url)` returns an HTTPResponse object; read `.read()` for HTML bytes
12
+ - **requests library** — Preferred over urllib; `requests.get(url)` with headers, params, timeout support
13
+ - **Error handling** — Catch `HTTPError` (4xx/5xx), `URLError` (server not found), and connection timeouts
14
+ - **Response checking** — Always check `response.status_code`; handle 403 (forbidden), 404 (not found), 500 (server error)
15
+
16
+ ### BeautifulSoup Basics
17
+ - **Creating soup** — `BeautifulSoup(html, 'html.parser')` or use `'lxml'` for speed
18
+ - **Direct tag access** — `soup.h1`, `soup.title` returns first matching tag
19
+ - **Tag attributes** — `tag.attrs` returns dict; `tag['href']` for specific attribute; `tag.get_text()` for text content
20
+ - **None checking** — Always check if `soup.find()` returns None before accessing attributes
21
+
22
+ ---
23
+
24
+ ## Chapter 2: Advanced HTML Parsing
25
+
26
+ ### find and findAll
27
+ - **`find(tag, attributes, recursive, text, keywords)`** — Returns first matching element
28
+ - **`findAll(tag, attributes, recursive, text, limit, keywords)`** — Returns list of all matches
29
+ - **Attribute filtering** — `find('div', {'class': 'price'})`, `find('span', {'id': 'result'})`
30
+ - **Multiple tags** — `findAll(['h1', 'h2', 'h3'])` matches any of the listed tags
31
+ - **Text search** — `findAll(text='exact match')` or `findAll(text=re.compile('pattern'))`
32
+
33
+ ### CSS Selectors
34
+ - **`select(selector)`** — Use CSS selectors: `soup.select('div.content > p')`, `soup.select('#main .item')`
35
+ - **Common selectors** — `tag`, `.class`, `#id`, `tag.class`, `parent > child`, `ancestor descendant`, `tag[attr=val]`
36
+ - **Pseudo-selectors** — `:nth-of-type()`, `:first-child`, etc. for positional selection
37
+
38
+ ### Navigating the DOM Tree
39
+ - **Children** — `tag.children` (direct children iterator), `tag.descendants` (all descendants)
40
+ - **Siblings** — `tag.next_sibling`, `tag.previous_sibling`, `tag.next_siblings` (iterator)
41
+ - **Parents** — `tag.parent`, `tag.parents` (iterator up to document root)
42
+ - **Navigation tip** — NavigableString objects (text nodes) count as siblings; use `.find_next_sibling('tag')` to skip
43
+
44
+ ### Regular Expressions with BeautifulSoup
45
+ - **Regex in find** — `soup.find('img', {'src': re.compile(r'\.jpg$')})` matches pattern against attribute
46
+ - **Regex in findAll** — `soup.findAll('a', {'href': re.compile(r'^/wiki/')})` for link patterns
47
+ - **Text regex** — `soup.findAll(text=re.compile(r'\$[\d,]+'))` for finding price patterns
48
+
49
+ ### Lambda Functions
50
+ - **Lambda filters** — `soup.find_all(lambda tag: len(tag.attrs) == 2)` for custom tag filtering
51
+ - **Complex conditions** — Combine tag name, attributes, text content in lambda for precise selection
52
+
53
+ ---
54
+
55
+ ## Chapter 3: Writing Web Crawlers
56
+
57
+ ### Single-Domain Crawling
58
+ - **Internal link collection** — Find all `<a>` tags; filter for same-domain links using `urlparse`
59
+ - **URL normalization** — Resolve relative URLs with `urljoin`; strip fragments and query strings for dedup
60
+ - **Visited tracking** — Maintain a `set()` of visited URLs; check before fetching
61
+ - **Breadth-first** — Use a queue (collections.deque) for BFS traversal of site
62
+ - **Depth-first** — Use a stack (list) for DFS; useful for deep hierarchical sites
63
+
64
+ ### Building Robust Crawlers
65
+ - **Recursive crawling** — Function that fetches page, extracts links, recurses on unvisited links
66
+ - **Data extraction during crawl** — Extract target data while crawling; don't just collect URLs
67
+ - **Depth limiting** — Set maximum crawl depth to prevent infinite recursion
68
+ - **URL deduplication** — Normalize URLs before adding to visited set; handle trailing slashes, www prefix
69
+
70
+ ---
71
+
72
+ ## Chapter 4: Web Crawling Models
73
+
74
+ ### Planning a Crawl
75
+ - **Site mapping** — Understand site structure before coding; identify URL patterns, pagination, categories
76
+ - **Crawl scope** — Define which pages/sections to include or exclude
77
+ - **Data schema** — Define what to extract before building; normalize across different page layouts
78
+
79
+ ### Handling Different Layouts
80
+ - **Template detection** — Sites may use different templates for different content types
81
+ - **Conditional parsing** — Check page type (product vs category vs article) and apply appropriate parser
82
+ - **Data normalization** — Map different field names/formats from different layouts to a unified schema
83
+
84
+ ### Cross-Site Crawling
85
+ - **Multi-domain** — Maintain per-domain settings (delays, selectors, credentials)
86
+ - **Link following policies** — Decide which external links to follow; whitelist/blacklist domains
87
+ - **Politeness per domain** — Track per-domain request timing; respect each site's robots.txt
88
+
89
+ ---
90
+
91
+ ## Chapter 5: Scrapy
92
+
93
+ ### Scrapy Architecture
94
+ - **Spider** — Defines how to crawl and parse; subclass `scrapy.Spider`; implement `parse()` method
95
+ - **Items** — Structured data containers; define fields with `scrapy.Item` and `scrapy.Field()`
96
+ - **Pipelines** — Process items after extraction; validate, clean, store to database/file
97
+ - **Middleware** — Hook into request/response processing; add headers, proxy rotation, retry logic
98
+ - **Settings** — Configure concurrency (`CONCURRENT_REQUESTS`), delays (`DOWNLOAD_DELAY`), user agent, etc.
99
+
100
+ ### CrawlSpider
101
+ - **Rules** — Define `Rule(LinkExtractor(...), callback=...)` for automatic link following
102
+ - **LinkExtractor** — Filter links by `allow` (regex), `deny`, `restrict_css`, `restrict_xpaths`
103
+ - **Callback** — Assign parse methods to different URL patterns; `follow=True` for recursive crawling
104
+
105
+ ### Scrapy Best Practices
106
+ - **Item loaders** — Use `ItemLoader` for cleaner extraction with input/output processors
107
+ - **Logging** — Configure log levels (`LOG_LEVEL = 'INFO'`); log to file for production runs
108
+ - **Autothrottle** — Enable `AUTOTHROTTLE_ENABLED` for adaptive request pacing
109
+ - **Feed exports** — Built-in export to JSON, CSV, XML via `-o output.json`
110
+ - **Contracts** — Add docstring-based contracts for spider testing
111
+
112
+ ---
113
+
114
+ ## Chapter 6: Storing Data
115
+
116
+ ### File Storage
117
+ - **CSV** — Use `csv.writer` or `csv.DictWriter`; handle encoding with `encoding='utf-8'`
118
+ - **JSON** — Use `json.dump()` for structured data; JSON Lines for streaming/appending
119
+ - **Raw files** — Download images, PDFs with `urllib.request.urlretrieve()` or `requests.get()` with streaming
120
+
121
+ ### Database Storage
122
+ - **MySQL** — Use `pymysql` connector; parameterized queries to prevent SQL injection
123
+ - **PostgreSQL** — Use `psycopg2`; connection pooling for concurrent scrapers
124
+ - **SQLite** — Use built-in `sqlite3` for lightweight local storage; good for prototyping
125
+ - **Schema design** — Design tables to match extracted data; use appropriate types; add indexes on lookup columns
126
+
127
+ ### Email Integration
128
+ - **smtplib** — Send scraped data or alerts via email; useful for monitoring scraper results
129
+ - **Notifications** — Alert on scraper failures, unusual data patterns, or completion
130
+
131
+ ### Storage Best Practices
132
+ - **Idempotent storage** — Check for duplicates before inserting; use UPSERT patterns
133
+ - **Raw preservation** — Store raw HTML alongside extracted data for re-parsing capability
134
+ - **Batch operations** — Use bulk inserts for efficiency; commit in batches, not per-row
135
+ - **Connection management** — Use context managers; close connections properly; handle reconnection
136
+
137
+ ---
138
+
139
+ ## Chapter 7: Reading Documents
140
+
141
+ ### PDF Extraction
142
+ - **PDFMiner** — Extract text from PDFs; handle multi-column layouts and tables
143
+ - **Page-by-page** — Process PDFs page by page for memory efficiency
144
+ - **Tables in PDFs** — Use tabula-py or camelot for structured table extraction
145
+
146
+ ### Word Documents
147
+ - **python-docx** — Read `.docx` files; extract paragraphs, tables, headers
148
+ - **Older formats** — Handle `.doc` files with antiword or textract
149
+
150
+ ### Encoding
151
+ - **Character detection** — Use `chardet` to detect file encoding when unknown
152
+ - **UTF-8 normalization** — Convert all text to UTF-8; handle BOM (Byte Order Mark)
153
+ - **HTML encoding** — Read `<meta charset>` tag; handle entity references (`&amp;`, `&lt;`)
154
+
155
+ ---
156
+
157
+ ## Chapter 8: Cleaning Dirty Data
158
+
159
+ ### String Normalization
160
+ - **Whitespace** — Strip leading/trailing whitespace; normalize internal whitespace (multiple spaces to one)
161
+ - **Unicode normalization** — Use `unicodedata.normalize('NFKD', text)` for consistent Unicode representation
162
+ - **Case normalization** — Lowercase for comparison; preserve original for display
163
+
164
+ ### Regex Cleaning
165
+ - **Pattern extraction** — Use regex groups to extract structured data from messy text (prices, dates, phone numbers)
166
+ - **Substitution** — `re.sub()` to remove or replace unwanted characters and patterns
167
+ - **Compiled patterns** — Pre-compile frequently used patterns with `re.compile()` for performance
168
+
169
+ ### Data Normalization
170
+ - **Date formats** — Parse various date formats with `dateutil.parser`; store in ISO 8601
171
+ - **Number formats** — Handle commas, currency symbols, percentage signs; convert to numeric types
172
+ - **Address normalization** — Standardize address components; handle abbreviations
173
+
174
+ ### OpenRefine
175
+ - **Faceting** — Group similar values to find inconsistencies
176
+ - **Clustering** — Automatically find and merge similar values (fingerprint, n-gram, etc.)
177
+ - **GREL expressions** — Transform data with OpenRefine's expression language
178
+
179
+ ---
180
+
181
+ ## Chapter 9: Natural Language Processing
182
+
183
+ ### Text Analysis
184
+ - **N-grams** — Extract sequences of N words; useful for finding common phrases and patterns
185
+ - **Frequency analysis** — Count word/phrase frequencies; identify key topics in scraped text
186
+ - **Stop words** — Filter common words (the, is, at) to focus on meaningful content
187
+
188
+ ### Markov Models
189
+ - **Text generation** — Build Markov chains from scraped text; generate similar-style text
190
+ - **Chain order** — Higher order (2-gram, 3-gram) produces more coherent but less varied output
191
+
192
+ ### NLTK
193
+ - **Tokenization** — Split text into words and sentences with NLTK tokenizers
194
+ - **Part-of-speech tagging** — Tag words as nouns, verbs, etc. for structured extraction
195
+ - **Named entity recognition** — Extract names, organizations, locations from text
196
+ - **Stemming/lemmatization** — Reduce words to base forms for better matching and analysis
197
+
198
+ ---
199
+
200
+ ## Chapter 10: Crawling Through Forms and Logins
201
+
202
+ ### Form Submission
203
+ - **POST requests** — `requests.post(url, data={'field': 'value'})` for form submission
204
+ - **CSRF tokens** — Extract hidden CSRF token from form HTML; include in POST data
205
+ - **Form fields** — Inspect form with browser DevTools; identify all required fields including hidden ones
206
+ - **File uploads** — Use `files` parameter in `requests.post()` for multipart form data
207
+
208
+ ### Session Management
209
+ - **requests.Session()** — Maintains cookies across requests; handles redirects; connection pooling
210
+ - **Cookie persistence** — Session object automatically stores and sends cookies
211
+ - **Login flow** — GET login page → extract CSRF → POST credentials → use session for authenticated pages
212
+
213
+ ### Authentication
214
+ - **HTTP Basic Auth** — `requests.get(url, auth=('user', 'pass'))` for Basic authentication
215
+ - **Token-based** — Extract auth token from login response; send in headers for subsequent requests
216
+ - **OAuth** — Use `requests-oauthlib` for OAuth-protected APIs
217
+ - **Session expiry** — Detect expired sessions (redirects to login); re-authenticate automatically
218
+
219
+ ---
220
+
221
+ ## Chapter 11: Scraping JavaScript
222
+
223
+ ### Selenium WebDriver
224
+ - **Setup** — `webdriver.Chrome()` or `webdriver.Firefox()`; requires matching driver binary
225
+ - **Headless mode** — `options.add_argument('--headless')` for browser without GUI; essential for servers
226
+ - **Navigation** — `driver.get(url)`; `driver.find_element(By.CSS_SELECTOR, selector)`
227
+ - **Interaction** — `.click()`, `.send_keys()`, `.clear()` on elements; simulate user behavior
228
+
229
+ ### Waiting for Content
230
+ - **Implicit waits** — `driver.implicitly_wait(10)` sets default wait for element finding
231
+ - **Explicit waits** — `WebDriverWait(driver, 10).until(EC.presence_of_element_located(...))` for specific conditions
232
+ - **Expected conditions** — `element_to_be_clickable`, `visibility_of_element_located`, `text_to_be_present_in_element`
233
+ - **Custom waits** — Write lambda conditions for complex wait scenarios
234
+
235
+ ### JavaScript Execution
236
+ - **Execute script** — `driver.execute_script('return document.title')` runs JS in page context
237
+ - **Scroll page** — `driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')` for infinite scroll
238
+ - **Extract data** — Execute JS to extract data from page variables, localStorage, or DOM
239
+
240
+ ### Ajax Handling
241
+ - **Wait for Ajax** — Wait for specific elements that load asynchronously
242
+ - **Network monitoring** — Intercept XHR requests to find underlying API endpoints
243
+ - **Alternative approach** — If you can identify the API endpoint, use `requests` directly instead of Selenium
244
+
245
+ ---
246
+
247
+ ## Chapter 12: Crawling Through APIs
248
+
249
+ ### REST API Basics
250
+ - **HTTP methods** — GET (read), POST (create), PUT (update), DELETE (remove)
251
+ - **JSON responses** — `response.json()` for parsing; handle nested objects and arrays
252
+ - **Headers** — Set `Accept: application/json`, `Authorization: Bearer token`
253
+ - **Query parameters** — `requests.get(url, params={'key': 'value'})` for clean URL building
254
+
255
+ ### Undocumented APIs
256
+ - **Browser DevTools** — Use Network tab to discover API calls made by JavaScript
257
+ - **XHR filtering** — Filter network requests to XHR/Fetch to find data endpoints
258
+ - **Request replication** — Copy request headers, cookies, parameters from DevTools to Python
259
+ - **API reverse engineering** — Study request patterns to understand pagination, filtering, authentication
260
+
261
+ ### API Best Practices
262
+ - **Rate limiting** — Respect rate limit headers; implement backoff on 429 responses
263
+ - **Pagination** — Handle cursor-based, offset-based, and link-header pagination
264
+ - **Error handling** — Retry on 5xx errors with exponential backoff; don't retry on 4xx (except 429 Too Many Requests, which warrants backoff and retry)
265
+ - **Authentication** — Store API keys securely; handle token refresh for OAuth
266
+
267
+ ---
268
+
269
+ ## Chapter 13: Image Processing and OCR
270
+
271
+ ### Pillow (PIL)
272
+ - **Image loading** — `Image.open(path)` or from URL response content
273
+ - **Manipulation** — Resize, crop, rotate, filter for preprocessing before OCR
274
+ - **Thresholding** — Convert to grayscale; apply threshold for clean black/white text
275
+
276
+ ### Tesseract OCR
277
+ - **pytesseract** — `pytesseract.image_to_string(image)` for text extraction from images
278
+ - **Preprocessing** — Clean images before OCR: denoise, deskew, threshold, resize
279
+ - **Language support** — Specify language with `lang='eng'`; install language packs as needed
280
+ - **Confidence** — Use `image_to_data()` for per-word confidence scores; filter low confidence
281
+
282
+ ### CAPTCHA Handling
283
+ - **Simple CAPTCHAs** — Preprocessing + OCR may solve simple text CAPTCHAs
284
+ - **Complex CAPTCHAs** — Consider CAPTCHA-solving services or rethink approach (use API instead)
285
+ - **Ethical note** — CAPTCHAs exist to prevent automated access; respect their purpose
286
+
287
+ ---
288
+
289
+ ## Chapter 14: Avoiding Scraping Traps
290
+
291
+ ### Headers and Identity
292
+ - **User-Agent** — Set a realistic browser User-Agent string; rotate for large-scale scraping
293
+ - **Accept headers** — Include Accept, Accept-Language, Accept-Encoding to mimic real browsers
294
+ - **Referer** — Set appropriate Referer header when navigating between pages
295
+ - **Cookie handling** — Accept and send cookies; use sessions for automatic management
296
+
297
+ ### Behavioral Patterns
298
+ - **Request timing** — Add random delays between requests (1-5 seconds); avoid perfectly regular intervals
299
+ - **Navigation patterns** — Don't jump straight to data pages; mimic human browsing (home → category → product)
300
+ - **Click patterns** — With Selenium, click through pages naturally rather than jumping directly to URLs
301
+
302
+ ### Honeypot Detection
303
+ - **Hidden links** — Check for CSS `display:none` or `visibility:hidden` links; avoid following them
304
+ - **Hidden form fields** — Pre-filled hidden fields may be traps; don't submit unexpected values
305
+ - **Link patterns** — Suspicious URL patterns or link text may indicate honeypots
306
+
307
+ ### IP and Session Management
308
+ - **Proxy rotation** — Rotate IP addresses for large-scale scraping; use proxy services
309
+ - **Session rotation** — Create new sessions periodically; don't use same cookies indefinitely
310
+ - **Fingerprint diversity** — Vary headers, timing, and behavior to avoid fingerprinting
311
+
312
+ ---
313
+
314
+ ## Chapter 15: Testing Scrapers
315
+
316
+ ### Unit Testing
317
+ - **Parse function tests** — Test parsing functions with saved HTML files; verify extracted data
318
+ - **Fixture files** — Save representative HTML pages as test fixtures; don't hit live sites in tests
319
+ - **Edge cases** — Test with missing elements, empty pages, different layouts, malformed HTML
320
+
321
+ ### Integration Testing
322
+ - **End-to-end** — Test full scrape pipeline from fetch to storage with known target pages
323
+ - **Selenium tests** — Use Selenium for testing JavaScript-heavy scraping flows
324
+ - **Mock responses** — Use `responses` or `requests-mock` libraries for HTTP mocking in tests
325
+
326
+ ### Testing Best Practices
327
+ - **Site change detection** — Periodically check if site structure has changed; alert on selector failures
328
+ - **Regression testing** — Compare current results against known-good baselines
329
+ - **CI integration** — Run scraper tests in CI pipeline; catch issues before deployment
330
+
331
+ ---
332
+
333
+ ## Chapter 16: Parallel Web Scraping
334
+
335
+ ### Threading
336
+ - **threading module** — Use for I/O-bound scraping; GIL doesn't block network operations
337
+ - **Thread pool** — `concurrent.futures.ThreadPoolExecutor` for managed thread pools
338
+ - **Thread safety** — Use locks for shared state (counters, result lists); prefer queues for task distribution
339
+
340
+ ### Multiprocessing
341
+ - **multiprocessing module** — Use for CPU-bound processing (parsing, cleaning); bypasses GIL
342
+ - **Process pool** — `concurrent.futures.ProcessPoolExecutor` for managed process pools
343
+ - **Inter-process communication** — Use Queue for task distribution; Pipe for point-to-point
344
+
345
+ ### Queue-Based Architecture
346
+ - **Producer-consumer** — Producer adds URLs to queue; consumers fetch and parse in parallel
347
+ - **URL frontier** — Priority queue for managing which URLs to crawl next
348
+ - **Result aggregation** — Collect results from workers into shared storage
349
+
350
+ ### Parallel Best Practices
351
+ - **Per-domain limits** — Limit concurrent requests per domain even with parallel scraping
352
+ - **Graceful shutdown** — Handle KeyboardInterrupt; drain queues cleanly on shutdown
353
+ - **Error isolation** — One worker's failure shouldn't crash the entire scraping operation
354
+ - **Progress tracking** — Log completed/remaining tasks; monitor worker health
355
+
356
+ ---
357
+
358
+ ## Chapter 17: Remote Scraping
359
+
360
+ ### Tor
361
+ - **Tor proxy** — Route requests through Tor network for anonymity; `socks5://127.0.0.1:9050` (standalone Tor daemon) or `socks5://127.0.0.1:9150` (Tor Browser bundle)
362
+ - **IP verification** — Check IP with a service like httpbin.org/ip to verify Tor is active
363
+ - **Performance** — Tor is slow; use only when anonymity is required
364
+ - **Circuit rotation** — Signal Tor to create new circuit for fresh IP; don't rotate too frequently
365
+
366
+ ### Proxy Services
367
+ - **Rotating proxies** — Commercial proxy services provide rotating IP pools
368
+ - **Proxy types** — HTTP/HTTPS proxies work at the application layer (HTTP traffic only); SOCKS proxies tunnel at a lower level and can carry any TCP traffic
369
+ - **Proxy configuration** — `requests.get(url, proxies={'http': proxy_url})`; or configure in Scrapy settings
370
+
371
+ ### Cloud-Based Scraping
372
+ - **Headless instances** — Run scrapers on cloud VMs (AWS, GCP, DigitalOcean) for scale
373
+ - **Containerization** — Docker containers for consistent scraper environments
374
+ - **Scheduling** — Use cron, cloud schedulers, or orchestration tools for recurring scrapes
375
+ - **Cost management** — Right-size instances; use spot/preemptible instances for batch scraping
376
+
377
+ ---
378
+
379
+ ## Chapter 18: Legalities and Ethics
380
+
381
+ ### Legal Framework
382
+ - **robots.txt** — Machine-readable file at `/robots.txt`; specifies which paths are allowed/disallowed
383
+ - **Terms of Service** — Many sites prohibit scraping in ToS; understand the legal weight
384
+ - **CFAA** — Computer Fraud and Abuse Act (US); accessing computers "without authorization" is a federal crime
385
+ - **Copyright** — Scraped data may be copyrighted; fair use depends on purpose and amount
386
+ - **GDPR** — If scraping personal data of EU citizens, GDPR obligations apply
387
+
388
+ ### Ethical Scraping
389
+ - **Respect the site** — Don't overload servers; honor rate limits; scrape during off-peak hours
390
+ - **Identify yourself** — Use a descriptive User-Agent; provide contact email for site administrators
391
+ - **Minimize footprint** — Only scrape what you need; don't archive entire sites unnecessarily
392
+ - **Data handling** — Handle scraped personal data responsibly; minimize collection and storage
393
+ - **Give back** — If possible, contribute to the site or community; don't just extract value