wgit 0.0.12 → 0.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +28 -22
- data/TODO.txt +2 -1
- data/lib/wgit.rb +1 -0
- data/lib/wgit/crawler.rb +30 -9
- data/lib/wgit/document.rb +70 -153
- data/lib/wgit/document_extensions.rb +57 -0
- data/lib/wgit/url.rb +80 -24
- data/lib/wgit/utils.rb +36 -7
- data/lib/wgit/version.rb +1 -1
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b8f6c1946b739327ba5b52a5541aa1496e2aea53c23e925bbb5e5fe7c063ddcc
+  data.tar.gz: 1bcf5dd5e41711758fdc737afba0707319de25cb524cfe08c14a50efb9a5f3e0
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ad42cd8e392894a21a1c4497bed9d1efabbc8b2320825ab7a3fc7e0615157f28f262bcaa4a4910411ac7662b4b416848b97b68c264f2ce937be34cfa6a56a34c
+  data.tar.gz: fbaf8d6a48f996ce2c0b3cb4ee72a26cfede2368f3992e110c6f049e84b9ba6a2568aac0f192559ae10127db5d3cb0bda5dbbe508247ae5f918f3fc400528e75
data/README.md
CHANGED
@@ -1,10 +1,10 @@
 # Wgit
 
-Wgit is a Ruby gem similar in nature to GNU's `wget
+Wgit is a Ruby gem similar in nature to GNU's `wget` tool. It provides an easy to use API for programmatic web scraping, indexing and searching.
 
-Fundamentally, Wgit is a WWW indexer/scraper which crawls URL's, retrieves and serialises their page contents for later use. You can use Wgit to copy entire websites if required. Wgit also provides a means to search indexed documents stored in a database. Therefore, this library provides the main components of a WWW search engine. The Wgit API is easily extended allowing you to pull out the parts of a webpage that are important to you, the code snippets or
+Fundamentally, Wgit is a WWW indexer/scraper which crawls URL's, retrieves and serialises their page contents for later use. You can use Wgit to copy entire websites if required. Wgit also provides a means to search indexed documents stored in a database. Therefore, this library provides the main components of a WWW search engine. The Wgit API is easily extended allowing you to pull out the parts of a webpage that are important to you, the code snippets or tables for example. As Wgit is a library, it has uses in many different application types.
 
-Check out this [example application](https://search-engine-rb.herokuapp.com) - a search engine built using Wgit and Sinatra, deployed to Heroku.
+Check out this [example application](https://search-engine-rb.herokuapp.com) - a search engine (see its [repository](https://github.com/michaeltelford/search_engine)) built using Wgit and Sinatra, deployed to Heroku. Heroku's free tier is used so the initial page load may be slow. Try searching for "Ruby" or something else that's Ruby related.
 
 ## Table Of Contents
 
@@ -51,20 +51,21 @@ doc = crawler.crawl url
 doc.class # => Wgit::Document
 doc.stats # => {
 # :url=>44, :html=>28133, :title=>17, :keywords=>0,
-# :links=>35, :
+# :links=>35, :text_snippets=>67, :text_bytes=>13735
 #}
 
 # doc responds to the following methods:
 Wgit::Document.instance_methods(false).sort # => [
-# :==, :[], :author, :css, :date_crawled, :doc, :empty?, :external_links,
-# :external_urls, :html, :internal_full_links, :internal_links,
-# :
-# :
-# :to_h, :to_hash, :to_json, :url,
+# :==, :[], :author, :css, :date_crawled, :doc, :empty?, :external_links,
+# :external_urls, :html, :internal_full_links, :internal_links,
+# :internal_links_without_anchors, :keywords, :links, :relative_full_links,
+# :relative_full_urls, :relative_links, :relative_urls, :score, :search,
+# :search!, :size, :stats, :text, :title, :to_h, :to_hash, :to_json, :url,
+# :xpath
 #]
 
 results = doc.search "corruption"
-results.first # => "ial materials involving war, spying and corruption.
+results.first # => "ial materials involving war, spying and corruption.
 # It has so far published more"
 ```
 
@@ -120,8 +121,8 @@ my_pages_keywords = ["Everest", "mountaineering school", "adventure"]
 my_pages_missing_keywords = []
 
 competitor_urls = [
-  "http://altitudejunkies.com",
-  "http://www.mountainmadness.com",
+  "http://altitudejunkies.com",
+  "http://www.mountainmadness.com",
   "http://www.adventureconsultants.com"
 ]
 
@@ -188,7 +189,7 @@ require 'wgit/core_ext' # => Provides the String#to_url and Enumerable#to_urls m
 # Here we create our own document rather than crawling the web.
 # We pass the web page's URL and HTML Strings.
 doc = Wgit::Document.new(
-  "http://test-url.com".to_url,
+  "http://test-url.com".to_url,
   "<html><p>How now brown cow.</p><a href='http://www.google.co.uk'>Click me!</a></html>"
 )
 
@@ -216,7 +217,7 @@ doc.search(query).first # => "How now brown cow."
 
 db.insert doc.external_links
 
-urls_to_crawl = db.uncrawled_urls # => Results will include doc.external_links.
+urls_to_crawl = db.uncrawled_urls # => Results will include doc.external_links.
 ```
 
 ## Extending The API
@@ -247,7 +248,7 @@ Wgit::Document.text_elements << :a
 
 # Our Document has a link whose's text we're interested in.
 doc = Wgit::Document.new(
-  "http://some_url.com".to_url,
+  "http://some_url.com".to_url,
   "<html><p>Hello world!</p>\
 <a href='https://made-up-link.com'>Click this link.</a></html>"
 )
@@ -258,13 +259,13 @@ doc.text # => ["Hello world!", "Click this link."]
 
 **Note**: This only works for textual page content. For more control over the indexed elements themselves, see below.
 
-### 2. Defining Custom Indexers
+### 2. Defining Custom Indexers Via Document Extensions
 
 If you want full control over the elements being indexed for your own purposes, then you can define a custom indexer for each type of element that you're interested in.
 
 Once you have the indexed page element, accessed via a `Wgit::Document` instance method, you can do with it as you wish e.g. obtain it's text value or manipulate the element etc. Since the returned types are plain [Nokogiri](https://www.rubydoc.info/github/sparklemotion/nokogiri) objects, you have the full control that the Nokogiri gem gives you.
 
-Here's how to add a
+Here's how to add a Document extension to index a specific page element:
 
 ```ruby
 require 'wgit'
@@ -283,7 +284,7 @@ end
 
 # Our Document has a table which we're interested in.
 doc = Wgit::Document.new(
-  "http://some_url.com".to_url,
+  "http://some_url.com".to_url,
   "<html><p>Hello world!</p>\
 <table><th>Header Text</th><th>Another Header</th></table></html>"
 )
@@ -296,16 +297,19 @@ tables.class # => Nokogiri::XML::NodeSet
 tables.first.class # => Nokogiri::XML::Element
 ```
 
+**Note**: Wgit uses Document extensions to provide much of it's core functionality, providing access to a webpages text or links for example. These [default Document extensions](https://github.com/michaeltelford/wgit/blob/master/lib/wgit/document_extensions.rb) provide examples for your own.
+
 **Extension Notes**:
 
-- Any links should be mapped into `Wgit::Url` objects; Url's are treated as Strings when being inserted into the database.
-- Any object (like a Nokogiri object) will not be inserted into the database,
+- Any page links should be mapped into `Wgit::Url` objects; Url's are treated as Strings when being inserted into the database.
+- Any object (like a Nokogiri object) will not be inserted into the database, it's up to you to map each object into a primitive type e.g. `Boolean, Array` etc.
 
 ## Caveats
 
 Below are some points to keep in mind when using Wgit:
 
-- All Url's must be prefixed with an appropiate protocol e.g. `https://`
+- All absolute `Wgit::Url`'s must be prefixed with an appropiate protocol e.g. `https://`
+- By default, up to 5 URL redirects will be followed; this is configurable however.
 
 ## Executable
 
@@ -317,11 +321,13 @@ This executable will be very similar in nature to `./bin/console` which is curre
 
 ## Development
 
+The current road map is rudimentally listed in the [TODO.txt](https://github.com/michaeltelford/wgit/blob/master/TODO.txt) file.
+
 For a full list of available Rake tasks, run `bundle exec rake help`. The most commonly used tasks are listed below...
 
 After checking out the repo, run `./bin/setup` to install dependencies (requires `bundler`). Then, run `bundle exec rake test` to run the tests. You can also run `./bin/console` for an interactive REPL that will allow you to experiment with the code.
 
-To generate code documentation run `bundle exec
+To generate code documentation run `bundle exec yard doc`. To browse the generated documentation run `bundle exec yard server -r`.
 
 To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, see the *Gem Publishing Checklist* section of the `TODO.txt` file.
 
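The instance-method list above now includes `internal_links_without_anchors`, which backs the new crawling behaviour further down this diff. A minimal sketch of the difference, using a made-up URL and HTML; the shown return values are illustrative:

```ruby
require 'wgit'

doc = Wgit::Document.new(
  Wgit::Url.new("http://test-url.com"),
  "<html><a href='/about#team'>About</a><a href='#top'>Top</a></html>"
)

doc.internal_links                 # e.g. ["about#team", "#top"]
doc.internal_links_without_anchors # e.g. ["about"] - anchors stripped, anchor-only links dropped
doc.stats[:links]                  # => 2
```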
data/TODO.txt
CHANGED
@@ -8,7 +8,8 @@ Primary
 
 Secondary
 ---------
-- Setup a dedicated mLab account for the example application in the README - the Heroku deployed search engine; then index some ruby sites like ruby.org
+- Setup a dedicated mLab account for the example application in the README - the Heroku deployed search engine; then index some ruby sites like ruby.org etc.
+- Think about how we handle invalid url's on crawled documents. Setup tests and implement logic for this scenario.
 - Think about ignoring non html documents/urls e.g. http://server/image.jpg etc. by implementing MIME types (defaulting to only HTML).
 - Check if Document::TEXT_ELEMENTS is expansive enough.
 - Possibly use refine instead of core-ext?
data/lib/wgit.rb
CHANGED
@@ -4,6 +4,7 @@ require_relative 'wgit/assertable'
 require_relative 'wgit/utils'
 require_relative 'wgit/url'
 require_relative 'wgit/document'
+require_relative 'wgit/document_extensions'
 require_relative 'wgit/crawler'
 require_relative 'wgit/database/connection_details'
 require_relative 'wgit/database/model'
data/lib/wgit/crawler.rb
CHANGED
@@ -2,7 +2,7 @@ require_relative 'url'
 require_relative 'document'
 require_relative 'utils'
 require_relative 'assertable'
-require 'net/http' #
+require 'net/http' # Requires 'uri'.
 
 module Wgit
 
@@ -11,6 +11,15 @@ module Wgit
   class Crawler
     include Assertable
 
+    # The default maximum amount of allowed URL redirects.
+    @default_redirect_limit = 5
+
+    class << self
+      # Class level instance accessor methods for @default_redirect_limit.
+      # Call using Wgit::Crawler.default_redirect_limit etc.
+      attr_accessor :default_redirect_limit
+    end
+
     # The urls to crawl.
     attr_reader :urls
 
@@ -67,7 +76,7 @@ module Wgit
     # Crawls individual urls, not entire sites.
     #
     # @param urls [Array<Wgit::Url>] The URLs to crawl.
-    # @yield [
+    # @yield [Wgit::Document] If provided, the block is given each crawled
     #   Document. Otherwise each doc is added to @docs which can be accessed
     #   by Crawler#docs after this method returns.
     # @return [Wgit::Document] The last Document crawled.
@@ -82,7 +91,7 @@ module Wgit
     # Crawl the url and return the response document or nil.
     #
     # @param url [Wgit::Document] The URL to crawl.
-    # @yield [
+    # @yield [Wgit::Document] The crawled HTML Document regardless if the
    #   crawl was successful or not. Therefore, the Document#url can be used.
     # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
     #   crawl was unsuccessful.
@@ -95,10 +104,11 @@ module Wgit
       doc.empty? ? nil : doc
     end
 
-    # Crawls an entire
+    # Crawls an entire website's HTML pages by recursively going through
+    # its internal links. Each crawled web Document is yielded to a block.
     #
     # @param base_url [Wgit::Url] The base URL of the website to be crawled.
-    # @yield [
+    # @yield [Wgit::Document] Given each crawled Document/page of the site.
     #   A block is the only way to interact with each crawled Document.
     # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
     #   from all of the site's pages or nil if the base_url could not be
@@ -112,7 +122,7 @@ module Wgit
       path = base_url.path.nil? ? '/' : base_url.path
       crawled_urls = [path]
       external_urls = doc.external_links
-      internal_urls = doc
+      internal_urls = get_internal_links(doc)
 
       return doc.external_links.uniq if internal_urls.empty?
 
@@ -126,7 +136,7 @@ module Wgit
           doc = crawl_url(Wgit::Url.concat(base_url.to_base, link), &block)
           crawled_urls << link
           next if doc.nil?
-          internal_urls.concat(doc
+          internal_urls.concat(get_internal_links(doc))
           external_urls.concat(doc.external_links)
         end
       end
@@ -158,14 +168,15 @@ module Wgit
       Wgit.logger.debug(
         "Wgit::Crawler#fetch('#{url}') exception: #{ex.message}"
       )
+      @last_response = nil
       nil
     end
 
     # The resolve method performs a HTTP GET to obtain the HTML document.
     # A certain amount of redirects will be followed by default before raising
-    # an exception. Redirects can be disabled by setting `redirect_limit:
+    # an exception. Redirects can be disabled by setting `redirect_limit: 0`.
    # The Net::HTTPResponse will be returned.
-    def resolve(url, redirect_limit:
+    def resolve(url, redirect_limit: Wgit::Crawler.default_redirect_limit)
       redirect_count = -1
       begin
         raise "Too many redirects" if redirect_count >= redirect_limit
@@ -186,6 +197,16 @@ module Wgit
       @urls << Wgit::Url.new(url)
     end
 
+    # Pull out the doc's internal HTML page links for crawling.
+    def get_internal_links(doc)
+      doc.
+        internal_links_without_anchors.
+        reject do |link|
+          ext = link.to_extension
+          ext ? !['htm', 'html'].include?(ext) : false
+        end
+    end
+
     alias :crawl :crawl_urls
     alias :crawl_r :crawl_site
   end
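A short sketch of the two crawler changes above: the class-level `default_redirect_limit` and the fact that `crawl_site` now only follows internal links that look like HTML pages (via the private `get_internal_links`). The site URL is made up and a no-arg `Crawler.new` is assumed here:

```ruby
require 'wgit'

# Raise (or lower) the global default used by Crawler#resolve whenever an
# explicit redirect_limit: keyword isn't passed.
Wgit::Crawler.default_redirect_limit = 10

crawler = Wgit::Crawler.new # Assumption: the no-arg constructor.
external_urls = crawler.crawl_site(Wgit::Url.new("http://www.example.com")) do |doc|
  # Internal links with a non htm/html extension (e.g. /logo.png) are no
  # longer queued for crawling; each crawled page is yielded here.
  puts doc.url
end

puts external_urls.inspect # Unique external links collected from the site.
```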
data/lib/wgit/document.rb
CHANGED
@@ -19,11 +19,17 @@ module Wgit
     # The HTML elements that make up the visible text on a page.
     # These elements are used to initialize the @text of the Document.
     # See the README.md for how to add to this Array dynamically.
-
+    @text_elements = [
       :dd, :div, :dl, :dt, :figcaption, :figure, :hr, :li,
       :main, :ol, :p, :pre, :span, :ul, :h1, :h2, :h3, :h4, :h5
     ]
 
+    class << self
+      # Class level instance reader method for @text_elements.
+      # Call using Wgit::Document.text_elements.
+      attr_reader :text_elements
+    end
+
     # The URL of the webpage, an instance of Wgit::Url.
     attr_reader :url
 
@@ -157,7 +163,7 @@ module Wgit
       if var == :@text
         count = 0
         @text.each { |t| count += t.length }
-        hash[:
+        hash[:text_snippets] = @text.length
         hash[:text_bytes] = count
       # Else take the var's #length method return value.
       else
@@ -202,8 +208,12 @@ module Wgit
       @doc.css(selector)
     end
 
-    # Get all internal links of this Document in relative form. Internal
-    # meaning a link to another
+    # Get all the internal links of this Document in relative form. Internal
+    # meaning a link to another document on this domain. This Document's domain
+    # is used to determine if an absolute URL is actually a relative link e.g.
+    # For a Document representing http://server.com/about, an absolute link of
+    # <a href='http://server.com/search'> will be recognized and returned as an
+    # internal link because both Documents live on the same domain. Also see
     # Wgit::Document#internal_full_links.
     #
     # @return [Array<Wgit::Url>] self's internal/relative URL's.
@@ -216,12 +226,28 @@ module Wgit
       rescue
         true
       end.
-      map(&:
+      map(&:without_base).
+      map do |link| # We map @url.to_host into / because it's a duplicate.
+        link.to_host == @url.to_host ? Wgit::Url.new('/') : link
+      end
 
-      process_arr(links)
+      Wgit::Utils.process_arr(links)
     end
 
-    # Get all internal links of this Document
+    # Get all the internal links of this Document with their anchors removed
+    # (if present). Also see Wgit::Document#internal_links.
+    #
+    # @return [Array<Wgit::Url>] self's internal/relative URL's with their
+    #   anchors removed.
+    def internal_links_without_anchors
+      in_links = internal_links
+      return [] if in_links.empty?
+      in_links.
+        map(&:without_anchor).
+        reject(&:empty?)
+    end
+
+    # Get all the internal links of this Document and append them to this
     # Document's base URL making them absolute. Also see
     # Wgit::Document#internal_links.
     #
@@ -233,8 +259,8 @@ module Wgit
       in_links.map { |link| @url.to_base.concat(link) }
     end
 
-    # Get all external links of this Document. External meaning a link to
-    #
+    # Get all the external links of this Document. External meaning a link to
+    # a different domain.
     #
     # @return [Array<Wgit::Url>] self's external/absolute URL's.
     def external_links
@@ -248,7 +274,7 @@ module Wgit
       end.
       map(&:without_trailing_slash)
 
-      process_arr(links)
+      Wgit::Utils.process_arr(links)
     end
 
     # Searches against the @text for the given search query.
@@ -305,12 +331,19 @@ module Wgit
 
     ### Document (Class) methods ###
 
-    #
+    # Uses Document.text_elements to build an xpath String, used to obtain
+    # all of the combined text on a webpage.
     #
-    # @return [
-
-
-
+    # @return [String] An xpath String to obtain a webpage's text elements.
+    def self.text_elements_xpath
+      xpath = ""
+      return xpath if Wgit::Document.text_elements.empty?
+      el_xpath = "//%s/text()"
+      Wgit::Document.text_elements.each_with_index do |el, i|
+        xpath += " | " unless i == 0
+        xpath += el_xpath % [el]
+      end
+      xpath
     end
 
     # Initialises a private instance variable with the xpath or database object
@@ -326,7 +359,11 @@ module Wgit
     # effectively implements ORM like behavior using this class.
     #
     # @param var [Symbol] The name of the variable to be initialised.
-    # @param xpath [String]
+    # @param xpath [String, Object#call] The xpath used to find the element(s)
+    #   of the webpage. Pass a callable object (proc etc.) if you want the
+    #   xpath value to be derived on Document initialisation (instead of when
+    #   the extension is defined). The call method must return a valid xpath
+    #   String.
     # @option options [Boolean] :singleton The singleton option determines
     #   whether or not the result(s) should be in an Array. If multiple
     #   results are found and singleton is true then the first result will be
@@ -334,10 +371,13 @@ module Wgit
     # @option options [Boolean] :text_content_only The text_content_only option
     #   if true will use the text content of the Nokogiri result object,
     #   otherwise the Nokogiri object itself is returned. Defaults to true.
-    # @yield [
+    # @yield [Object, Symbol] Yields the value about to be assigned to the new
+    #   var and the source of the value (either :html or :object aka database).
     #   The return value of the block becomes the new var value, unless nil.
-    #   Return nil if you want to inspect but not change the var value.
-    #
+    #   Return nil if you want to inspect but not change the var value. The
+    #   block gets executed when a Document is initialized from html or an
+    #   object.
+    # @return [Symbol] The first half of the newly defined method names e.g.
     #   if var == "title" then :init_title is returned.
     def self.define_extension(var, xpath, options = {}, &block)
       default_options = { singleton: true, text_content_only: true }
@@ -389,6 +429,12 @@ module Wgit
       end
     end
 
+    # Ensure the @url and @html Strings are correctly encoded etc.
+    def process_url_and_html
+      @url = Wgit::Utils.process_str(@url)
+      @html = Wgit::Utils.process_str(@html)
+    end
+
     # Returns an object/value from this Document's @html using the provided
     # xpath param.
     # singleton ? results.first (single Object) : results (Array)
@@ -396,6 +442,7 @@ module Wgit
     # A block can be used to set the final value before it is returned.
     # Return nil from the block if you don't want to override the value.
     def find_in_html(xpath, singleton: true, text_content_only: true)
+      xpath = xpath.call if xpath.respond_to?(:call)
       results = @doc.xpath(xpath)
 
       if results and not results.empty?
@@ -408,10 +455,10 @@ module Wgit
         result = singleton ? nil : []
       end
 
-      singleton ? process_str(result) : process_arr(result)
+      singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
 
       if block_given?
-        new_result = yield(result)
+        new_result = yield(result, :html)
         result = new_result if new_result
       end
 
@@ -427,10 +474,10 @@ module Wgit
 
       default = singleton ? nil : []
       result = obj.fetch(key.to_s, default)
-      singleton ? process_str(result) : process_arr(result)
+      singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
 
       if block_given?
-        new_result = yield(result)
+        new_result = yield(result, :object)
         result = new_result if new_result
       end
 
@@ -454,136 +501,6 @@ module Wgit
       end
     end
 
-    # Takes Docuent.text_elements and returns an xpath String used to obtain
-    # all of the combined text.
-    def text_elements_xpath
-      xpath = ""
-      return xpath if @@text_elements.empty?
-      el_xpath = "//%s/text()"
-      @@text_elements.each_with_index do |el, i|
-        xpath += " | " unless i == 0
-        xpath += el_xpath % [el]
-      end
-      xpath
-    end
-
-    # Processes a String to make it uniform.
-    def process_str(str)
-      if str.is_a?(String)
-        str.encode!('UTF-8', 'UTF-8', invalid: :replace)
-        str.strip!
-      end
-      str
-    end
-
-    # Processes an Array to make it uniform.
-    def process_arr(array)
-      if array.is_a?(Array)
-        array.map! { |str| process_str(str) }
-        array.reject! { |str| str.is_a?(String) ? str.empty? : false }
-        array.compact!
-        array.uniq!
-      end
-      array
-    end
-
-    # Ensure the @url and @html Strings are correctly encoded etc.
-    def process_url_and_html
-      @url = process_str(@url)
-      @html = process_str(@html)
-    end
-
-    ### Default init_* (Document extension) methods. ###
-
-    # Init methods for title.
-
-    def init_title_from_html
-      xpath = "//title"
-      result = find_in_html(xpath)
-      init_var(:@title, result)
-    end
-
-    def init_title_from_object(obj)
-      result = find_in_object(obj, "title")
-      init_var(:@title, result)
-    end
-
-    # Init methods for author.
-
-    def init_author_from_html
-      xpath = "//meta[@name='author']/@content"
-      result = find_in_html(xpath)
-      init_var(:@author, result)
-    end
-
-    def init_author_from_object(obj)
-      result = find_in_object(obj, "author")
-      init_var(:@author, result)
-    end
-
-    # Init methods for keywords.
-
-    def init_keywords_from_html
-      xpath = "//meta[@name='keywords']/@content"
-      result = find_in_html(xpath) do |keywords|
-        if keywords
-          keywords = keywords.split(",")
-          process_arr(keywords)
-        end
-        keywords
-      end
-      init_var(:@keywords, result)
-    end
-
-    def init_keywords_from_object(obj)
-      result = find_in_object(obj, "keywords", singleton: false)
-      init_var(:@keywords, result)
-    end
-
-    # Init methods for links.
-
-    def init_links_from_html
-      # Any element with a href or src attribute is considered a link.
-      xpath = '//*/@href | //*/@src'
-      result = find_in_html(xpath, singleton: false) do |links|
-        if links
-          links.map! do |link|
-            begin
-              Wgit::Url.new(link)
-            rescue
-              nil
-            end
-          end
-          links.compact!
-        end
-        links
-      end
-      init_var(:@links, result)
-    end
-
-    def init_links_from_object(obj)
-      result = find_in_object(obj, "links", singleton: false) do |links|
-        if links
-          links.map! { |link| Wgit::Url.new(link) }
-        end
-        links
-      end
-      init_var(:@links, result)
-    end
-
-    # Init methods for text.
-
-    def init_text_from_html
-      xpath = text_elements_xpath
-      result = find_in_html(xpath, singleton: false)
-      init_var(:@text, result)
-    end
-
-    def init_text_from_object(obj)
-      result = find_in_object(obj, "text", singleton: false)
-      init_var(:@text, result)
-    end
-
     alias :to_hash :to_h
     alias :relative_links :internal_links
     alias :relative_urls :internal_links
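Given the reworked `define_extension` above (callable xpath, block now yielded the value plus its source), here is a hedged sketch of a custom extension. The `:code` extension and its reader are made up for illustration and assume, as the README's tables example suggests, that `define_extension` also exposes a `doc.code` reader:

```ruby
require 'wgit'

# A hypothetical :code extension. The block receives the value about to be
# assigned and where it came from (:html or :object).
Wgit::Document.define_extension(
  :code,
  proc { '//code/text()' }, # Callable xpath, evaluated on each Document init.
  singleton: false,
  text_content_only: true
) do |snippets, source|
  # Collapse internal whitespace, but only when parsing raw HTML.
  snippets.map! { |s| s.gsub(/\s+/, ' ') } if snippets and source == :html
  snippets
end

doc = Wgit::Document.new(
  Wgit::Url.new("http://example.com"),
  "<html><code> puts 1 + 1 </code></html>"
)
doc.code # e.g. ["puts 1 + 1"]
```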
data/lib/wgit/document_extensions.rb
ADDED
@@ -0,0 +1,57 @@
+### Default Document Extensions ###
+
+# Title.
+Wgit::Document.define_extension(
+  :title,
+  '//title',
+  singleton: true,
+  text_content_only: true,
+)
+
+# Author.
+Wgit::Document.define_extension(
+  :author,
+  '//meta[@name="author"]/@content',
+  singleton: true,
+  text_content_only: true,
+)
+
+# Keywords.
+Wgit::Document.define_extension(
+  :keywords,
+  '//meta[@name="keywords"]/@content',
+  singleton: true,
+  text_content_only: true,
+) do |keywords, source|
+  if keywords and source == :html
+    keywords = keywords.split(',')
+    Wgit::Utils.process_arr(keywords)
+  end
+  keywords
+end
+
+# Links.
+Wgit::Document.define_extension(
+  :links,
+  '//a/@href',
+  singleton: false,
+  text_content_only: true,
+) do |links|
+  if links
+    links.map! do |link|
+      Wgit::Url.new(link)
+    rescue
+      nil
+    end
+    links.compact!
+  end
+  links
+end
+
+# Text.
+Wgit::Document.define_extension(
+  :text,
+  proc { Wgit::Document.text_elements_xpath },
+  singleton: false,
+  text_content_only: true,
+)
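Because the default `:text` extension above passes a proc rather than a literal xpath String, additions to `Document.text_elements` are picked up whenever a new Document is initialised. A minimal sketch, lifted from the README's own extension example:

```ruby
require 'wgit'

Wgit::Document.text_elements << :a # Index anchor text from now on.

doc = Wgit::Document.new(
  Wgit::Url.new("http://some_url.com"),
  "<html><p>Hello world!</p><a href='https://made-up-link.com'>Click this link.</a></html>"
)
doc.text # => ["Hello world!", "Click this link."]
```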
data/lib/wgit/url.rb
CHANGED
@@ -107,15 +107,19 @@ module Wgit
     # @raise [RuntimeError] If the link is invalid.
     def self.relative_link?(link, base: nil)
       raise "Invalid link: #{link}" if link.nil? or link.empty?
-
-
+
+      link = Wgit::Url.new(link)
+      if base
+        base = Wgit::Url.new(base)
+        if base.to_scheme.nil?
+          raise "Invalid base, must contain protocol prefix: #{base}"
+        end
       end
 
-
-      if uri.relative?
+      if link.to_uri.relative?
         true
       else
-        base ?
+        base ? link.to_host == base.to_host : false
       end
     end
 
@@ -125,11 +129,10 @@ module Wgit
     # @param link [Wgit::Url, String] The link to add to the host prefix.
     # @return [Wgit::Url] host + "/" + link
     def self.concat(host, link)
-
-
-
-
-      Wgit::Url.new(url + separator + link)
+      host = Wgit::Url.new(host).without_trailing_slash
+      link = Wgit::Url.new(link).without_leading_slash
+      separator = (link.start_with?('#') or link.start_with?('?')) ? '' : '/'
+      Wgit::Url.new(host + separator + link)
     end
 
     # Returns if self is a relative or absolute Url. If base is provided and
@@ -217,9 +220,9 @@ module Wgit
       path = @uri.path
       return nil if path.nil? or path.empty?
       return Wgit::Url.new('/') if path == '/'
-
-
-
+      Wgit::Url.new(path).
+        without_leading_slash.
+        without_trailing_slash
     end
 
     # Returns the endpoint of this URL e.g. the bit after the host with any
@@ -253,26 +256,77 @@ module Wgit
       anchor ? Wgit::Url.new("##{anchor}") : nil
     end
 
-    # Returns a new Wgit::Url containing just the
-    #
+    # Returns a new Wgit::Url containing just the file extension of this URL
+    # e.g. Given http://google.com#about.html, html is returned.
     #
-    # @return [Wgit::Url, nil] Containing just the
-
-
-
-
-
-      both.empty? ? nil : Wgit::Url.new(both)
+    # @return [Wgit::Url, nil] Containing just the extension string or nil.
+    def to_extension
+      path = to_path
+      return nil unless path
+      segs = path.split('.')
+      segs.length > 1 ? Wgit::Url.new(segs.last) : nil
     end
 
     # Returns a new Wgit::Url containing self without a trailing slash. Is
-    # idempotent
+    # idempotent meaning self will always be returned regardless of whether
+    # there's a trailing slash or not.
     #
-    # @return [Wgit::Url]
+    # @return [Wgit::Url] Self without a trailing slash.
+    def without_leading_slash
+      start_with?('/') ? Wgit::Url.new(self[1..-1]) : self
+    end
+
+    # Returns a new Wgit::Url containing self without a trailing slash. Is
+    # idempotent meaning self will always be returned regardless of whether
+    # there's a trailing slash or not.
+    #
+    # @return [Wgit::Url] Self without a trailing slash.
     def without_trailing_slash
       end_with?('/') ? Wgit::Url.new(chop) : self
     end
 
+    # Returns a new Wgit::Url containing self without a leading or trailing
+    # slash. Is idempotent and will return self regardless if there's slashes
+    # present or not.
+    #
+    # @return [Wgit::Url] Self without leading or trailing slashes.
+    def without_slashes
+      self.
+        without_leading_slash.
+        without_trailing_slash
+    end
+
+    # Returns a new Wgit::Url with the base (proto and host) removed e.g. Given
+    # http://google.com/search?q=something#about, search?q=something#about is
+    # returned. If relative and base isn't present then self is returned.
+    # Leading and trailing slashes are always stripped from the return value.
+    #
+    # @return [Wgit::Url] Self containing everything after the base.
+    def without_base
+      base_url = to_base
+      without_base = base_url ? gsub(base_url, '') : self
+
+      return self if ['', '/'].include?(without_base)
+      Wgit::Url.new(without_base).
+        without_leading_slash.
+        without_trailing_slash
+    end
+
+    # Returns a new Wgit::Url with the anchor portion removed e.g. Given
+    # http://google.com/search#about, http://google.com/search is
+    # returned. Self is returned as is if no anchor is present. A URL
+    # consisting of only an anchor e.g. '#about' will return an empty URL.
+    # This method assumes that the anchor is correctly placed at the very end
+    # of the URL.
+    #
+    # @return [Wgit::Url] Self with the anchor portion removed.
+    def without_anchor
+      anchor = to_anchor
+      without_anchor = anchor ? gsub(anchor, '') : self
+
+      Wgit::Url.new(without_anchor)
+    end
+
     # Returns a Hash containing this Url's instance vars excluding @uri.
     # Used when storing the URL in a Database e.g. MongoDB etc.
     #
@@ -298,6 +352,8 @@ module Wgit
     alias :anchor :to_anchor
     alias :to_fragment :to_anchor
     alias :fragment :to_anchor
+    alias :extension :to_extension
+    alias :without_fragment :without_anchor
     alias :internal_link? :relative_link?
     alias :is_relative? :relative_link?
     alias :is_internal? :relative_link?
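A quick sketch of the new URL helpers added above, using a made-up URL; the values shown follow from the code in this diff:

```ruby
require 'wgit'

url = Wgit::Url.new("http://example.com/blog/post.html#comments")

url.to_extension   # => "html"
url.without_anchor # => "http://example.com/blog/post.html"
url.without_base   # => "blog/post.html#comments"

Wgit::Url.concat("http://example.com/", "/about") # => "http://example.com/about"
```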
data/lib/wgit/utils.rb
CHANGED
@@ -2,7 +2,7 @@ module Wgit
 
   # Utility module containing generic methods.
   module Utils
-
+
     # Returns the current time stamp.
     #
     # @return [Time] The current time stamp.
@@ -27,7 +27,7 @@ module Wgit
       end
       hash
     end
-
+
     # Returns the model having removed non bson types (for use with MongoDB).
     #
     # @param model_hash [Hash] The model Hash to process.
@@ -60,7 +60,7 @@ module Wgit
     # @param index [Integer] The first index of a word in sentence. This is
     #   usually a word in a search query.
     # @param sentence_limit [Integer] The max length of the formatted sentence
-    #   being returned. The length will be based on the sentence_limit
+    #   being returned. The length will be based on the sentence_limit
     #   parameter or the full length of the original sentence, which ever
     #   is less. The full sentence is returned if the sentence_limit is 0.
     # @return [String] The sentence once formatted.
@@ -70,7 +70,7 @@ module Wgit
       if index < 0 or index > sentence.length
         raise "Incorrect index value: #{index}"
       end
-
+
       return sentence if sentence_limit == 0
 
       start = 0
@@ -131,11 +131,11 @@ module Wgit
     #   to output text somewhere e.g. STDOUT (the default).
     # @return [nil]
     def self.printf_search_results(results, query = nil, case_sensitive = false,
-                                   sentence_length = 80, keyword_count = 5,
+                                   sentence_length = 80, keyword_count = 5,
                                    stream = Kernel)
       raise "stream must respond_to? :puts" unless stream.respond_to? :puts
       keyword_count -= 1 # Because Array's are zero indexed.
-
+
       results.each do |doc|
         sentence = if query.nil?
           nil
@@ -155,8 +155,37 @@ module Wgit
         stream.puts doc.url
         stream.puts
       end
-
+
       nil
     end
+
+    # Processes a String to make it uniform. Strips off any leading/trailing
+    # white space and converts to UTF-8.
+    #
+    # @param str [String] The String to process. str is modified.
+    # @return [String] The processed str is both modified and then returned.
+    def self.process_str(str)
+      if str.is_a?(String)
+        str.encode!('UTF-8', 'UTF-8', invalid: :replace)
+        str.strip!
+      end
+      str
+    end
+
+    # Processes an Array to make it uniform. Removes empty Strings and nils,
+    # processes non empty Strings using Wgit::Utils.process_str before removing
+    # duplicates.
+    #
+    # @param arr [Enumerable] The Array to process. arr is modified.
+    # @return [Enumerable] The processed arr is both modified and then returned.
+    def self.process_arr(arr)
+      if arr.is_a?(Array)
+        arr.map! { |str| process_str(str) }
+        arr.reject! { |str| str.is_a?(String) ? str.empty? : false }
+        arr.compact!
+        arr.uniq!
+      end
+      arr
+    end
   end
 end
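The two helpers above were moved out of `Document` into `Utils` as module methods. A small usage sketch with made-up input:

```ruby
require 'wgit'

Wgit::Utils.process_str("  Hello World  ") # => "Hello World"

links = ["  /about ", "/about", "", nil, "/contact"]
Wgit::Utils.process_arr(links) # => ["/about", "/contact"]
```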
data/lib/wgit/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wgit
 version: !ruby/object:Gem::Version
-  version: 0.0.
+  version: 0.0.13
 platform: ruby
 authors:
 - Michael Telford
@@ -169,7 +169,7 @@ description: Fundamentally, Wgit is a WWW indexer/scraper which crawls URL's, re
   websites if required. Wgit also provides a means to search indexed documents stored
   in a database. Therefore, this library provides the main components of a WWW search
   engine. The Wgit API is easily extended allowing you to pull out the parts of a
-  webpage that are important to you, the code snippets or
+  webpage that are important to you, the code snippets or tables for example. As Wgit
   is a library, it has uses in many different application types.
 email: michael.telford@live.com
 executables: []
@@ -184,6 +184,7 @@ files:
 - "./lib/wgit/database/database.rb"
 - "./lib/wgit/database/model.rb"
 - "./lib/wgit/document.rb"
+- "./lib/wgit/document_extensions.rb"
 - "./lib/wgit/indexer.rb"
 - "./lib/wgit/logger.rb"
 - "./lib/wgit/url.rb"
@@ -218,6 +219,6 @@ rubyforge_project:
 rubygems_version: 2.7.6
 signing_key:
 specification_version: 4
-summary: Wgit is a Ruby gem similar in nature to GNU's `wget
-  to use API for programmatic web scraping, indexing and searching.
+summary: Wgit is a Ruby gem similar in nature to GNU's `wget` tool. It provides an
+  easy to use API for programmatic web scraping, indexing and searching.
 test_files: []