wgit 0.0.10 → 0.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE.txt +21 -0
- data/README.md +334 -0
- data/TODO.txt +35 -0
- data/lib/wgit/assertable.rb +4 -0
- data/lib/wgit/core_ext.rb +4 -2
- data/lib/wgit/crawler.rb +188 -188
- data/lib/wgit/database/database.rb +22 -21
- data/lib/wgit/document.rb +594 -592
- data/lib/wgit/url.rb +306 -278
- data/lib/wgit/version.rb +1 -1
- metadata +6 -3
data/lib/wgit/document.rb
CHANGED
@@ -1,592 +1,594 @@
|
|
1
|
-
require_relative 'url'
|
2
|
-
require_relative 'utils'
|
3
|
-
require_relative 'assertable'
|
4
|
-
require 'nokogiri'
|
5
|
-
require 'json'
|
6
|
-
|
7
|
-
module Wgit
|
8
|
-
|
9
|
-
# Class modeling a HTML web document. Also doubles as a search result when
|
10
|
-
# loading Documents from the database.
|
11
|
-
#
|
12
|
-
# The initialize method dynamically initializes certain variables from the
|
13
|
-
# Document HTML / Database object e.g. text. This bit is dynamic so that the
|
14
|
-
# Document class can be easily extended allowing you to pull out the bits of
|
15
|
-
# a webpage that are important to you. See Wgit::Document.define_extension.
|
16
|
-
class Document
|
17
|
-
include Assertable
|
18
|
-
|
19
|
-
# The HTML elements that make up the visible text on a page.
|
20
|
-
# These elements are used to initialize the @text of the Document.
|
21
|
-
# See the README.md for how to add to this Array dynamically.
|
22
|
-
@@text_elements = [
|
23
|
-
:dd, :div, :dl, :dt, :figcaption, :figure, :hr, :li,
|
24
|
-
:main, :ol, :p, :pre, :span, :ul, :h1, :h2, :h3, :h4, :h5
|
25
|
-
]
|
26
|
-
|
27
|
-
# The URL of the webpage, an instance of Wgit::Url.
|
28
|
-
attr_reader :url
|
29
|
-
|
30
|
-
# The HTML of the webpage, an instance of String.
|
31
|
-
attr_reader :html
|
32
|
-
|
33
|
-
# The Nokogiri document object initialized from @html.
|
34
|
-
attr_reader :doc
|
35
|
-
|
36
|
-
# The score is only used following a Database#search and records matches.
|
37
|
-
attr_reader :score
|
38
|
-
|
39
|
-
# Initialize takes either two strings (representing the URL and HTML) or an
|
40
|
-
# object representing a database record (of a HTTP crawled web page). This
|
41
|
-
# allows for initialisation from both crawled web pages and (afterwards)
|
42
|
-
# documents/web pages retrieved from the database.
|
43
|
-
#
|
44
|
-
# During initialisation, the Document will call any
|
45
|
-
# 'init_*_from_html' and 'init_*_from_object' methods it can find. Some
|
46
|
-
# default init_* methods exist while others can be defined by the user.
|
47
|
-
# See the README and Wgit::Document.define_extension for more info.
|
48
|
-
#
|
49
|
-
# @param url_or_obj [String, Object#fetch] Either a String representing a
|
50
|
-
# URL or a Hash-like object responding to :fetch. e.g. a MongoDB
|
51
|
-
# collection object. The Object's :fetch method should support Strings as
|
52
|
-
# keys.
|
53
|
-
# @param html [String] The crawled web page's HTML. This param is only
|
54
|
-
# required if url_or_obj is a String representing the web page's URL.
|
55
|
-
def initialize(url_or_obj, html = "")
|
56
|
-
# Init from URL String and HTML String.
|
57
|
-
if url_or_obj.is_a?(String)
|
58
|
-
url = url_or_obj
|
59
|
-
assert_type(url, Wgit::Url)
|
60
|
-
|
61
|
-
@url = url
|
62
|
-
@html = html ||= ""
|
63
|
-
@doc = init_nokogiri
|
64
|
-
@score = 0.0
|
65
|
-
|
66
|
-
process_url_and_html
|
67
|
-
|
68
|
-
# Dynamically run the init_*_from_html methods.
|
69
|
-
Document.private_instance_methods(false).each do |method|
|
70
|
-
if method.to_s.start_with?("init_") &&
|
71
|
-
method.to_s.end_with?("_from_html")
|
72
|
-
self.send(method)
|
73
|
-
end
|
74
|
-
end
|
75
|
-
# Init from a Hash like object containing Strings as keys e.g. Mongo
|
76
|
-
# collection obj.
|
77
|
-
else
|
78
|
-
obj = url_or_obj
|
79
|
-
assert_respond_to(obj, :fetch)
|
80
|
-
|
81
|
-
@url = obj.fetch("url") # Should always be present.
|
82
|
-
@html = obj.fetch("html", "")
|
83
|
-
@doc = init_nokogiri
|
84
|
-
@score = obj.fetch("score", 0.0)
|
85
|
-
|
86
|
-
process_url_and_html
|
87
|
-
|
88
|
-
# Dynamically run the init_*_from_object methods.
|
89
|
-
Document.private_instance_methods(false).each do |method|
|
90
|
-
if method.to_s.start_with?("init_") &&
|
91
|
-
method.to_s.end_with?("_from_object")
|
92
|
-
self.send(method, obj)
|
93
|
-
end
|
94
|
-
end
|
95
|
-
end
|
96
|
-
end
|
97
|
-
|
98
|
-
# Determines if both the url and html match. Use
|
99
|
-
# doc.object_id == other_doc.object_id for exact object comparison.
|
100
|
-
#
|
101
|
-
# @param other_doc [Wgit::Document] To compare self against.
|
102
|
-
# @return [Boolean] True if @url and @html are equal, false if not.
|
103
|
-
def ==(other_doc)
|
104
|
-
return false unless other_doc.is_a? Wgit::Document
|
105
|
-
@url == other_doc.url and @html == other_doc.html
|
106
|
-
end
|
107
|
-
|
108
|
-
# Is a shortcut for calling Document#html[range].
|
109
|
-
#
|
110
|
-
# @param range [Range] The range of @html to return.
|
111
|
-
# @return [String] The given range of @html.
|
112
|
-
def [](range)
|
113
|
-
@html[range]
|
114
|
-
end
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
#
|
124
|
-
#
|
125
|
-
#
|
126
|
-
#
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
#
|
137
|
-
#
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
#
|
147
|
-
#
|
148
|
-
#
|
149
|
-
#
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
#
|
188
|
-
#
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
#
|
197
|
-
#
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
#
|
206
|
-
#
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
#
|
225
|
-
#
|
226
|
-
#
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
#
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
#
|
255
|
-
#
|
256
|
-
#
|
257
|
-
#
|
258
|
-
#
|
259
|
-
#
|
260
|
-
#
|
261
|
-
#
|
262
|
-
#
|
263
|
-
#
|
264
|
-
#
|
265
|
-
# @
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
#
|
292
|
-
#
|
293
|
-
#
|
294
|
-
#
|
295
|
-
#
|
296
|
-
# @
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
#
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
#
|
317
|
-
#
|
318
|
-
#
|
319
|
-
#
|
320
|
-
#
|
321
|
-
#
|
322
|
-
#
|
323
|
-
#
|
324
|
-
#
|
325
|
-
#
|
326
|
-
#
|
327
|
-
#
|
328
|
-
#
|
329
|
-
#
|
330
|
-
#
|
331
|
-
#
|
332
|
-
#
|
333
|
-
#
|
334
|
-
# @
|
335
|
-
#
|
336
|
-
#
|
337
|
-
# @
|
338
|
-
#
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
#
|
366
|
-
#
|
367
|
-
#
|
368
|
-
#
|
369
|
-
# @
|
370
|
-
#
|
371
|
-
|
372
|
-
|
373
|
-
Document.send(:remove_method, "init_#{var}
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
#
|
387
|
-
#
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
#
|
393
|
-
#
|
394
|
-
#
|
395
|
-
#
|
396
|
-
#
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
#
|
422
|
-
#
|
423
|
-
#
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
#
|
441
|
-
# @param
|
442
|
-
# @
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
#
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
xpath +=
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
str.
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
array.
|
483
|
-
array.
|
484
|
-
array.
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
@
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
alias :
|
588
|
-
alias :
|
589
|
-
alias :
|
590
|
-
alias :
|
591
|
-
|
592
|
-
|
1
|
+
require_relative 'url'
|
2
|
+
require_relative 'utils'
|
3
|
+
require_relative 'assertable'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'json'
|
6
|
+
|
7
|
+
module Wgit
|
8
|
+
|
9
|
+
# Class modeling a HTML web document. Also doubles as a search result when
|
10
|
+
# loading Documents from the database.
|
11
|
+
#
|
12
|
+
# The initialize method dynamically initializes certain variables from the
|
13
|
+
# Document HTML / Database object e.g. text. This bit is dynamic so that the
|
14
|
+
# Document class can be easily extended allowing you to pull out the bits of
|
15
|
+
# a webpage that are important to you. See Wgit::Document.define_extension.
|
16
|
+
class Document
|
17
|
+
include Assertable
|
18
|
+
|
19
|
+
# The HTML elements that make up the visible text on a page.
|
20
|
+
# These elements are used to initialize the @text of the Document.
|
21
|
+
# See the README.md for how to add to this Array dynamically.
|
22
|
+
@@text_elements = [
|
23
|
+
:dd, :div, :dl, :dt, :figcaption, :figure, :hr, :li,
|
24
|
+
:main, :ol, :p, :pre, :span, :ul, :h1, :h2, :h3, :h4, :h5
|
25
|
+
]
|
26
|
+
|
27
|
+
# The URL of the webpage, an instance of Wgit::Url.
|
28
|
+
attr_reader :url
|
29
|
+
|
30
|
+
# The HTML of the webpage, an instance of String.
|
31
|
+
attr_reader :html
|
32
|
+
|
33
|
+
# The Nokogiri document object initialized from @html.
|
34
|
+
attr_reader :doc
|
35
|
+
|
36
|
+
# The score is only used following a Database#search and records matches.
|
37
|
+
attr_reader :score
|
38
|
+
|
39
|
+
# Initialize takes either two strings (representing the URL and HTML) or an
|
40
|
+
# object representing a database record (of a HTTP crawled web page). This
|
41
|
+
# allows for initialisation from both crawled web pages and (afterwards)
|
42
|
+
# documents/web pages retrieved from the database.
|
43
|
+
#
|
44
|
+
# During initialisation, the Document will call any
|
45
|
+
# 'init_*_from_html' and 'init_*_from_object' methods it can find. Some
|
46
|
+
# default init_* methods exist while others can be defined by the user.
|
47
|
+
# See the README and Wgit::Document.define_extension for more info.
|
48
|
+
#
|
49
|
+
# @param url_or_obj [String, Object#fetch] Either a String representing a
|
50
|
+
# URL or a Hash-like object responding to :fetch. e.g. a MongoDB
|
51
|
+
# collection object. The Object's :fetch method should support Strings as
|
52
|
+
# keys.
|
53
|
+
# @param html [String] The crawled web page's HTML. This param is only
|
54
|
+
# required if url_or_obj is a String representing the web page's URL.
|
55
|
+
def initialize(url_or_obj, html = "")
|
56
|
+
# Init from URL String and HTML String.
|
57
|
+
if url_or_obj.is_a?(String)
|
58
|
+
url = url_or_obj
|
59
|
+
assert_type(url, Wgit::Url)
|
60
|
+
|
61
|
+
@url = url
|
62
|
+
@html = html ||= ""
|
63
|
+
@doc = init_nokogiri
|
64
|
+
@score = 0.0
|
65
|
+
|
66
|
+
process_url_and_html
|
67
|
+
|
68
|
+
# Dynamically run the init_*_from_html methods.
|
69
|
+
Document.private_instance_methods(false).each do |method|
|
70
|
+
if method.to_s.start_with?("init_") &&
|
71
|
+
method.to_s.end_with?("_from_html")
|
72
|
+
self.send(method)
|
73
|
+
end
|
74
|
+
end
|
75
|
+
# Init from a Hash like object containing Strings as keys e.g. Mongo
|
76
|
+
# collection obj.
|
77
|
+
else
|
78
|
+
obj = url_or_obj
|
79
|
+
assert_respond_to(obj, :fetch)
|
80
|
+
|
81
|
+
@url = obj.fetch("url") # Should always be present.
|
82
|
+
@html = obj.fetch("html", "")
|
83
|
+
@doc = init_nokogiri
|
84
|
+
@score = obj.fetch("score", 0.0)
|
85
|
+
|
86
|
+
process_url_and_html
|
87
|
+
|
88
|
+
# Dynamically run the init_*_from_object methods.
|
89
|
+
Document.private_instance_methods(false).each do |method|
|
90
|
+
if method.to_s.start_with?("init_") &&
|
91
|
+
method.to_s.end_with?("_from_object")
|
92
|
+
self.send(method, obj)
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
98
|
+
# Determines if both the url and html match. Use
|
99
|
+
# doc.object_id == other_doc.object_id for exact object comparison.
|
100
|
+
#
|
101
|
+
# @param other_doc [Wgit::Document] To compare self against.
|
102
|
+
# @return [Boolean] True if @url and @html are equal, false if not.
|
103
|
+
def ==(other_doc)
|
104
|
+
return false unless other_doc.is_a? Wgit::Document
|
105
|
+
@url == other_doc.url and @html == other_doc.html
|
106
|
+
end
|
107
|
+
|
108
|
+
# Is a shortcut for calling Document#html[range].
|
109
|
+
#
|
110
|
+
# @param range [Range] The range of @html to return.
|
111
|
+
# @return [String] The given range of @html.
|
112
|
+
def [](range)
|
113
|
+
@html[range]
|
114
|
+
end
|
115
|
+
|
116
|
+
# Returns the timestamp of when this Wgit::Document was crawled.
|
117
|
+
#
|
118
|
+
# @return [Time] Time of when this Wgit::Document was crawled.
|
119
|
+
def date_crawled
|
120
|
+
@url.date_crawled
|
121
|
+
end
|
122
|
+
|
123
|
+
# Returns a Hash containing this Document's instance vars.
|
124
|
+
# Used when storing the Document in a Database e.g. MongoDB etc.
|
125
|
+
# By default the @html var is excluded from the returned Hash.
|
126
|
+
#
|
127
|
+
# @param include_html [Boolean] Whether or not to include @html in the
|
128
|
+
# returned Hash.
|
129
|
+
# @return [Hash] Containing self's instance vars.
|
130
|
+
def to_h(include_html = false)
|
131
|
+
ignore = include_html ? [] : ["@html"]
|
132
|
+
ignore << "@doc" # Always ignore "@doc"
|
133
|
+
Wgit::Utils.to_h(self, ignore)
|
134
|
+
end
|
135
|
+
|
136
|
+
# Converts this Document's to_h return value to a JSON String.
|
137
|
+
#
|
138
|
+
# @param include_html [Boolean] Whether or not to include @html in the
|
139
|
+
# returned JSON String.
|
140
|
+
# @return [String] This Document represented as a JSON String.
|
141
|
+
def to_json(include_html = false)
|
142
|
+
h = to_h(include_html)
|
143
|
+
JSON.generate(h)
|
144
|
+
end
|
145
|
+
|
146
|
+
# Returns a Hash containing this Document's instance variables and
|
147
|
+
# their :length (if they respond to it). Works dynamically so that any
|
148
|
+
# user defined extensions (and their created instance vars) will appear in
|
149
|
+
# the returned Hash as well. The number of text snippets as well as total
|
150
|
+
# number of textual bytes are always included in the returned Hash.
|
151
|
+
#
|
152
|
+
# @return [Hash] Containing self's HTML statistics.
|
153
|
+
def stats
|
154
|
+
hash = {}
|
155
|
+
instance_variables.each do |var|
|
156
|
+
# Add up the total bytes of text as well as the length.
|
157
|
+
if var == :@text
|
158
|
+
count = 0
|
159
|
+
@text.each { |t| count += t.length }
|
160
|
+
hash[:text_length] = @text.length
|
161
|
+
hash[:text_bytes] = count
|
162
|
+
# Else take the var's #length method return value.
|
163
|
+
else
|
164
|
+
next unless instance_variable_get(var).respond_to?(:length)
|
165
|
+
hash[var[1..-1].to_sym] =
|
166
|
+
instance_variable_get(var).send(:length)
|
167
|
+
end
|
168
|
+
end
|
169
|
+
hash
|
170
|
+
end
|
171
|
+
|
172
|
+
# Determine the size of this Document's HTML.
|
173
|
+
#
|
174
|
+
# @return [Integer] The total number of characters in @html (its #length).
|
175
|
+
def size
|
176
|
+
stats[:html]
|
177
|
+
end
|
178
|
+
|
179
|
+
# Determine if this Document's HTML is empty or not.
|
180
|
+
#
|
181
|
+
# @return [Boolean] True if @html is nil/empty, false otherwise.
|
182
|
+
def empty?
|
183
|
+
return true if @html.nil?
|
184
|
+
@html.empty?
|
185
|
+
end
|
186
|
+
|
187
|
+
# Uses Nokogiri's xpath method to search the doc's html and return the
|
188
|
+
# results.
|
189
|
+
#
|
190
|
+
# @param xpath [String] The xpath to search the @html with.
|
191
|
+
# @return [Nokogiri::XML::NodeSet] The result set of the xpath search.
|
192
|
+
def xpath(xpath)
|
193
|
+
@doc.xpath(xpath)
|
194
|
+
end
|
195
|
+
|
196
|
+
# Uses Nokogiri's css method to search the doc's html and return the
|
197
|
+
# results.
|
198
|
+
#
|
199
|
+
# @param selector [String] The CSS selector to search the @html with.
|
200
|
+
# @return [Nokogiri::XML::NodeSet] The result set of the CSS search.
|
201
|
+
def css(selector)
|
202
|
+
@doc.css(selector)
|
203
|
+
end
|
204
|
+
|
205
|
+
# Get all internal links of this Document in relative form. Internal
|
206
|
+
# meaning a link to another page on this website. Also see
|
207
|
+
# Wgit::Document#internal_full_links.
|
208
|
+
#
|
209
|
+
# @return [Array<Wgit::Url>] self's internal/relative URL's.
|
210
|
+
def internal_links
|
211
|
+
return [] if @links.empty?
|
212
|
+
|
213
|
+
links = @links.
|
214
|
+
reject do |link|
|
215
|
+
not link.relative_link?(base: @url.to_base)
|
216
|
+
rescue
|
217
|
+
true
|
218
|
+
end.
|
219
|
+
map(&:to_path_and_anchor)
|
220
|
+
|
221
|
+
process_arr(links)
|
222
|
+
end
|
223
|
+
|
224
|
+
# Get all internal links of this Document and append them to this
|
225
|
+
# Document's base URL making them absolute. Also see
|
226
|
+
# Wgit::Document#internal_links.
|
227
|
+
#
|
228
|
+
# @return [Array<Wgit::Url>] self's internal/relative URL's in absolute
|
229
|
+
# form.
|
230
|
+
def internal_full_links
|
231
|
+
in_links = internal_links
|
232
|
+
return [] if in_links.empty?
|
233
|
+
in_links.map { |link| @url.to_base.concat(link) }
|
234
|
+
end
|
235
|
+
|
236
|
+
# Get all external links of this Document. External meaning a link to
|
237
|
+
# another website.
|
238
|
+
#
|
239
|
+
# @return [Array<Wgit::Url>] self's external/absolute URL's.
|
240
|
+
def external_links
|
241
|
+
return [] if @links.empty?
|
242
|
+
|
243
|
+
links = @links.
|
244
|
+
reject do |link|
|
245
|
+
link.relative_link?(base: @url.to_base)
|
246
|
+
rescue
|
247
|
+
true
|
248
|
+
end.
|
249
|
+
map(&:without_trailing_slash)
|
250
|
+
|
251
|
+
process_arr(links)
|
252
|
+
end
|
253
|
+
|
254
|
+
# Searches against the @text for the given search query.
|
255
|
+
# The number of search hits for each sentence is recorded internally
|
256
|
+
# and used to rank/sort the search results before being returned. Where
|
257
|
+
# the Wgit::Database#search method searches all documents for the most hits,
|
258
|
+
# this method searches each document's @text for the most hits.
|
259
|
+
#
|
260
|
+
# Each search result consists of a sentence of a given length. The length
|
261
|
+
# will be based on the sentence_limit parameter or the full length of the
|
262
|
+
# original sentence, whichever is less. The algorithm ensures
|
263
|
+
# that the search query is visible somewhere in the sentence.
|
264
|
+
#
|
265
|
+
# @param query [String] The value to search the document's text against.
|
266
|
+
# @param sentence_limit [Integer] The max length of each search result
|
267
|
+
# sentence.
|
268
|
+
# @return [Array<String>] Representing the search results.
|
269
|
+
def search(query, sentence_limit = 80)
|
270
|
+
raise "A search query must be provided" if query.empty?
|
271
|
+
raise "The sentence_limit value must be even" if sentence_limit.odd?
|
272
|
+
|
273
|
+
results = {}
|
274
|
+
regex = Regexp.new(query, Regexp::IGNORECASE)
|
275
|
+
|
276
|
+
@text.each do |sentence|
|
277
|
+
hits = sentence.scan(regex).count
|
278
|
+
if hits > 0
|
279
|
+
sentence.strip!
|
280
|
+
index = sentence.index(regex)
|
281
|
+
Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
|
282
|
+
results[sentence] = hits
|
283
|
+
end
|
284
|
+
end
|
285
|
+
|
286
|
+
return [] if results.empty?
|
287
|
+
results = Hash[results.sort_by { |k, v| v }]
|
288
|
+
results.keys.reverse
|
289
|
+
end
|
290
|
+
|
291
|
+
# Performs a text search (see Document#search for details) but assigns the
|
292
|
+
# results to the @text instance variable. This can be used for sub search
|
293
|
+
# functionality. The original text is returned; no other reference to it
|
294
|
+
# is kept thereafter.
|
295
|
+
#
|
296
|
+
# @param query [String] The value to search the document's text against.
|
297
|
+
# @param sentence_limit [Integer] The max length of each search result
|
298
|
+
# sentence.
|
299
|
+
# @return [String] This Document's original @text value.
|
300
|
+
def search!(query, sentence_limit = 80)
|
301
|
+
orig_text = @text
|
302
|
+
@text = search(query, sentence_limit)
|
303
|
+
orig_text
|
304
|
+
end
|
305
|
+
|
306
|
+
### Document (Class) methods ###
|
307
|
+
|
308
|
+
# Returns Document.text_elements used to obtain the text in a webpage.
|
309
|
+
#
|
310
|
+
# @return [Array<Symbol>] The page elements containing visual text on a
|
311
|
+
# webpage.
|
312
|
+
def self.text_elements
|
313
|
+
@@text_elements
|
314
|
+
end
|
315
|
+
|
316
|
+
# Initialises a private instance variable with the xpath or database object
|
317
|
+
# result(s). When initialising from HTML, a true singleton value will only
|
318
|
+
# ever return one result otherwise all xpath results are returned in an
|
319
|
+
# Array. When initialising from a database object, the value is taken as
|
320
|
+
# is and singleton is only used to define the default empty value.
|
321
|
+
# If a value cannot be found (in either the HTML or database object), then
|
322
|
+
# a default will be used. The default is: singleton ? nil : [].
|
323
|
+
#
|
324
|
+
# Note that defined extensions work for both documents being crawled from
|
325
|
+
# the WWW and for documents being retrieved from the database. This
|
326
|
+
# effectively implements ORM like behavior using this class.
|
327
|
+
#
|
328
|
+
# @param var [Symbol] The name of the variable to be initialised.
|
329
|
+
# @param xpath [String] Used to find the element(s) of the webpage.
|
330
|
+
# @option options [Boolean] :singleton The singleton option determines
|
331
|
+
# whether or not the result(s) should be in an Array. If multiple
|
332
|
+
# results are found and singleton is true then the first result will be
|
333
|
+
# used. Defaults to true.
|
334
|
+
# @option options [Boolean] :text_content_only The text_content_only option
|
335
|
+
# if true will use the text content of the Nokogiri result object,
|
336
|
+
# otherwise the Nokogiri object itself is returned. Defaults to true.
|
337
|
+
# @yield [var_value] Gives the value about to be assigned to the new var.
|
338
|
+
# The return value of the block becomes the new var value, unless nil.
|
339
|
+
# Return nil if you want to inspect but not change the var value.
|
340
|
+
# @return [Symbol] The first half of the newly created method names e.g.
|
341
|
+
# if var == "title" then :init_title is returned.
|
342
|
+
def self.define_extension(var, xpath, options = {}, &block)
  # Caller supplied options take precedence over the defaults below.
  default_options = { singleton: true, text_content_only: true }
  options = default_options.merge(options)

  # Define the private init_*_from_html method for HTML.
  # Gets the HTML's xpath value and creates a var for it.
  func_name = Document.send(:define_method, "init_#{var}_from_html") do
    # find_in_html only declares keyword parameters, so the options Hash
    # has to be double-splatted. Passing it positionally raises an
    # ArgumentError on Ruby >= 3.0 (and emits a warning on Ruby 2.7).
    result = find_in_html(xpath, **options, &block)
    init_var(var, result)
  end
  Document.send :private, func_name

  # Define the private init_*_from_object method for a Database object.
  # Gets the Object's "key" value and creates a var for it. Only the
  # :singleton option is relevant here; it picks the default empty value.
  func_name = Document.send(:define_method, "init_#{var}_from_object") do |obj|
    result = find_in_object(obj, var.to_s, singleton: options[:singleton], &block)
    init_var(var, result)
  end
  Document.send :private, func_name

  # The shared prefix of the two method names defined above.
  "init_#{var}".to_sym
end
|
364
|
+
|
365
|
+
# Removes the init_* methods created when an extension is defined.
|
366
|
+
# Therefore, this is the opposing method to Document.define_extension.
|
367
|
+
# Returns true if successful or false if the method(s) cannot be found.
|
368
|
+
#
|
369
|
+
# @param var [Symbol] The extension variable already defined.
|
370
|
+
# @return [Boolean] True if the extension var was found and removed;
|
371
|
+
# otherwise false.
|
372
|
+
def self.remove_extension(var)
|
373
|
+
Document.send(:remove_method, "init_#{var}_from_html")
|
374
|
+
Document.send(:remove_method, "init_#{var}_from_object")
|
375
|
+
true
|
376
|
+
rescue NameError
|
377
|
+
false
|
378
|
+
end
|
379
|
+
|
380
|
+
private
|
381
|
+
|
382
|
+
# Initializes the nokogiri object using @html, which must be already set.
|
383
|
+
def init_nokogiri
|
384
|
+
raise "@html must be set" unless @html
|
385
|
+
Nokogiri::HTML(@html) do |config|
|
386
|
+
# TODO: Remove #'s below when crawling in production.
|
387
|
+
#config.options = Nokogiri::XML::ParseOptions::STRICT |
|
388
|
+
# Nokogiri::XML::ParseOptions::NONET
|
389
|
+
end
|
390
|
+
end
|
391
|
+
|
392
|
+
# Returns an object/value from this Document's @html using the provided
|
393
|
+
# xpath param.
|
394
|
+
# singleton ? results.first (single Object) : results (Array)
|
395
|
+
# text_content_only ? result.content (String) : result (nokogiri Object)
|
396
|
+
# A block can be used to set the final value before it is returned.
|
397
|
+
# Return nil from the block if you don't want to override the value.
|
398
|
+
def find_in_html(xpath, singleton: true, text_content_only: true)
|
399
|
+
results = @doc.xpath(xpath)
|
400
|
+
|
401
|
+
if results and not results.empty?
|
402
|
+
result = if singleton
|
403
|
+
text_content_only ? results.first.content : results.first
|
404
|
+
else
|
405
|
+
text_content_only ? results.map(&:content) : results
|
406
|
+
end
|
407
|
+
else
|
408
|
+
result = singleton ? nil : []
|
409
|
+
end
|
410
|
+
|
411
|
+
singleton ? process_str(result) : process_arr(result)
|
412
|
+
|
413
|
+
if block_given?
|
414
|
+
new_result = yield(result)
|
415
|
+
result = new_result if new_result
|
416
|
+
end
|
417
|
+
|
418
|
+
result
|
419
|
+
end
|
420
|
+
|
421
|
+
# Finds a value in the obj using the key.
|
422
|
+
# singleton is used to set the value if not found in obj.
|
423
|
+
# A block can be used to set the final value before it is returned.
|
424
|
+
# Return nil from the block if you don't want to override the value.
|
425
|
+
def find_in_object(obj, key, singleton: true)
|
426
|
+
assert_respond_to(obj, :fetch)
|
427
|
+
|
428
|
+
default = singleton ? nil : []
|
429
|
+
result = obj.fetch(key.to_s, default)
|
430
|
+
singleton ? process_str(result) : process_arr(result)
|
431
|
+
|
432
|
+
if block_given?
|
433
|
+
new_result = yield(result)
|
434
|
+
result = new_result if new_result
|
435
|
+
end
|
436
|
+
|
437
|
+
result
|
438
|
+
end
|
439
|
+
|
440
|
+
# Initialises an instance variable and defines a getter method for it.
|
441
|
+
# @param var [Symbol] The name of the variable to be initialized.
|
442
|
+
# @param value [Object] The newly initialized variable's value.
|
443
|
+
# @return [Symbol] The name of the newly created getter method.
|
444
|
+
def init_var(var, value)
|
445
|
+
# instance_var_name starts with @, var_name doesn't.
|
446
|
+
var = var.to_s
|
447
|
+
var_name = (var.start_with?("@") ? var[1..-1] : var).to_sym
|
448
|
+
instance_var_name = "@#{var_name}".to_sym
|
449
|
+
|
450
|
+
instance_variable_set(instance_var_name, value)
|
451
|
+
|
452
|
+
Document.send(:define_method, var_name) do
|
453
|
+
instance_variable_get(instance_var_name)
|
454
|
+
end
|
455
|
+
end
|
456
|
+
|
457
|
+
# Takes Document.text_elements and returns an xpath String used to obtain
|
458
|
+
# all of the combined text.
|
459
|
+
def text_elements_xpath
|
460
|
+
xpath = ""
|
461
|
+
return xpath if @@text_elements.empty?
|
462
|
+
el_xpath = "//%s/text()"
|
463
|
+
@@text_elements.each_with_index do |el, i|
|
464
|
+
xpath += " | " unless i == 0
|
465
|
+
xpath += el_xpath % [el]
|
466
|
+
end
|
467
|
+
xpath
|
468
|
+
end
|
469
|
+
|
470
|
+
# Processes a String to make it uniform.
|
471
|
+
def process_str(str)
|
472
|
+
if str.is_a?(String)
|
473
|
+
str.encode!('UTF-8', 'UTF-8', invalid: :replace)
|
474
|
+
str.strip!
|
475
|
+
end
|
476
|
+
str
|
477
|
+
end
|
478
|
+
|
479
|
+
# Processes an Array to make it uniform.
|
480
|
+
def process_arr(array)
|
481
|
+
if array.is_a?(Array)
|
482
|
+
array.map! { |str| process_str(str) }
|
483
|
+
array.reject! { |str| str.is_a?(String) ? str.empty? : false }
|
484
|
+
array.compact!
|
485
|
+
array.uniq!
|
486
|
+
end
|
487
|
+
array
|
488
|
+
end
|
489
|
+
|
490
|
+
# Ensure the @url and @html Strings are correctly encoded etc.
|
491
|
+
def process_url_and_html
|
492
|
+
@url = process_str(@url)
|
493
|
+
@html = process_str(@html)
|
494
|
+
end
|
495
|
+
|
496
|
+
### Default init_* (Document extension) methods. ###
|
497
|
+
|
498
|
+
# Init methods for title.
|
499
|
+
|
500
|
+
def init_title_from_html
|
501
|
+
xpath = "//title"
|
502
|
+
result = find_in_html(xpath)
|
503
|
+
init_var(:@title, result)
|
504
|
+
end
|
505
|
+
|
506
|
+
def init_title_from_object(obj)
|
507
|
+
result = find_in_object(obj, "title")
|
508
|
+
init_var(:@title, result)
|
509
|
+
end
|
510
|
+
|
511
|
+
# Init methods for author.
|
512
|
+
|
513
|
+
def init_author_from_html
|
514
|
+
xpath = "//meta[@name='author']/@content"
|
515
|
+
result = find_in_html(xpath)
|
516
|
+
init_var(:@author, result)
|
517
|
+
end
|
518
|
+
|
519
|
+
def init_author_from_object(obj)
|
520
|
+
result = find_in_object(obj, "author")
|
521
|
+
init_var(:@author, result)
|
522
|
+
end
|
523
|
+
|
524
|
+
# Init methods for keywords.
|
525
|
+
|
526
|
+
def init_keywords_from_html
|
527
|
+
xpath = "//meta[@name='keywords']/@content"
|
528
|
+
result = find_in_html(xpath) do |keywords|
|
529
|
+
if keywords
|
530
|
+
keywords = keywords.split(",")
|
531
|
+
process_arr(keywords)
|
532
|
+
end
|
533
|
+
keywords
|
534
|
+
end
|
535
|
+
init_var(:@keywords, result)
|
536
|
+
end
|
537
|
+
|
538
|
+
def init_keywords_from_object(obj)
|
539
|
+
result = find_in_object(obj, "keywords", singleton: false)
|
540
|
+
init_var(:@keywords, result)
|
541
|
+
end
|
542
|
+
|
543
|
+
# Init methods for links.
|
544
|
+
|
545
|
+
def init_links_from_html
|
546
|
+
# Any element with a href or src attribute is considered a link.
|
547
|
+
xpath = '//*/@href | //*/@src'
|
548
|
+
result = find_in_html(xpath, singleton: false) do |links|
|
549
|
+
if links
|
550
|
+
links.map! do |link|
|
551
|
+
begin
|
552
|
+
Wgit::Url.new(link)
|
553
|
+
rescue
|
554
|
+
nil
|
555
|
+
end
|
556
|
+
end
|
557
|
+
links.compact!
|
558
|
+
end
|
559
|
+
links
|
560
|
+
end
|
561
|
+
init_var(:@links, result)
|
562
|
+
end
|
563
|
+
|
564
|
+
def init_links_from_object(obj)
|
565
|
+
result = find_in_object(obj, "links", singleton: false) do |links|
|
566
|
+
if links
|
567
|
+
links.map! { |link| Wgit::Url.new(link) }
|
568
|
+
end
|
569
|
+
links
|
570
|
+
end
|
571
|
+
init_var(:@links, result)
|
572
|
+
end
|
573
|
+
|
574
|
+
# Init methods for text.
|
575
|
+
|
576
|
+
def init_text_from_html
|
577
|
+
xpath = text_elements_xpath
|
578
|
+
result = find_in_html(xpath, singleton: false)
|
579
|
+
init_var(:@text, result)
|
580
|
+
end
|
581
|
+
|
582
|
+
def init_text_from_object(obj)
|
583
|
+
result = find_in_object(obj, "text", singleton: false)
|
584
|
+
init_var(:@text, result)
|
585
|
+
end
|
586
|
+
|
587
|
+
alias :to_hash :to_h
|
588
|
+
alias :relative_links :internal_links
|
589
|
+
alias :relative_urls :internal_links
|
590
|
+
alias :relative_full_links :internal_full_links
|
591
|
+
alias :relative_full_urls :internal_full_links
|
592
|
+
alias :external_urls :external_links
|
593
|
+
end
|
594
|
+
end
|