title_grabber 0.2.3 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/title_grabber/version.rb +1 -1
- data/lib/title_grabber.rb +34 -21
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7532ff11d949765bb08b4ae1deeefc1aea1c70e8b794139cf747cf10c948e433
|
4
|
+
data.tar.gz: f59192b55720a85dab8767b03346fc335e825d5696d20f7ac46e0880a081b96d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1102126705a068b7e7e92721c93cc283bf9b4bef28a49c7799aeb3f03825548751ebb108861e73952ba79de57620d4e9390a6a95df32b9a04cc2e7a75df704c5
|
7
|
+
data.tar.gz: 24f10e575e1f5b706877e952e7fefdcba17a589a3a0f0282c697d6b1882e79f40066bac01509e5d3ed43d50f2102e68ff3f2baae5c5bd9c3eecd295e86f5af03
|
data/Gemfile.lock
CHANGED
data/lib/title_grabber.rb
CHANGED
@@ -16,32 +16,23 @@ module TitleGrabber
|
|
16
16
|
ART_TIT_HEAD = -"article_title"
|
17
17
|
HEADERS = [URL_HEADER, PAGE_TIT_HEAD, ART_TIT_HEAD].freeze
|
18
18
|
|
19
|
-
|
19
|
+
def self.call(lines, options)
|
20
|
+
MultiThreadedGrabber.new(lines, options).call
|
21
|
+
end
|
22
|
+
|
23
|
+
class MultiThreadedGrabber
|
20
24
|
include HTTPHelper
|
21
25
|
include TextHelper
|
22
26
|
|
23
|
-
|
24
|
-
out_path = options[:output]
|
25
|
-
|
26
|
-
processed_urls = if out_path.exist?
|
27
|
-
arr_of_h = CSV.read(out_path, headers: true)
|
28
|
-
arr_of_h.each_with_object({}) { |r, h|
|
29
|
-
page_tit = r[PAGE_TIT_HEAD]
|
30
|
-
art_tit = r[ART_TIT_HEAD]
|
31
|
-
|
32
|
-
unless page_tit.empty? && art_tit.empty?
|
33
|
-
h[r[URL_HEADER]] = { PAGE_TIT_HEAD => page_tit,
|
34
|
-
ART_TIT_HEAD => art_tit }
|
35
|
-
end
|
36
|
-
}.tap do
|
37
|
-
arr_of_h = nil
|
38
|
-
end
|
39
|
-
else
|
40
|
-
{}
|
41
|
-
end
|
27
|
+
attr_reader :lines, :out_path
|
42
28
|
|
43
|
-
|
29
|
+
def initialize(lines, options)
|
30
|
+
@lines = lines
|
31
|
+
@out_path = options[:output]
|
32
|
+
end
|
44
33
|
|
34
|
+
def call
|
35
|
+
queue = Queue.new
|
45
36
|
tmp_path = out_path.sub_ext(".tmp#{out_path.extname}")
|
46
37
|
CSV.open(tmp_path, "w", force_quotes: true) do |csv|
|
47
38
|
csv << HEADERS
|
@@ -89,5 +80,27 @@ module TitleGrabber
|
|
89
80
|
|
90
81
|
FileUtils.mv(tmp_path, out_path)
|
91
82
|
end
|
83
|
+
|
84
|
+
private
|
85
|
+
|
86
|
+
def processed_urls
|
87
|
+
@processed_urls ||= begin
|
88
|
+
urls = {}
|
89
|
+
|
90
|
+
if out_path.exist?
|
91
|
+
CSV.foreach(out_path, headers: true) do |r|
|
92
|
+
page_tit = r[PAGE_TIT_HEAD]
|
93
|
+
art_tit = r[ART_TIT_HEAD]
|
94
|
+
|
95
|
+
unless page_tit.empty? && art_tit.empty?
|
96
|
+
urls[r[URL_HEADER]] = { PAGE_TIT_HEAD => page_tit,
|
97
|
+
ART_TIT_HEAD => art_tit }
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
urls
|
103
|
+
end
|
104
|
+
end
|
92
105
|
end
|
93
106
|
end
|