duo_board_crawling 0.0.9 → 0.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/duo_board_crawling.rb +76 -64
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: de8524202eaf3906226695502f5ce53f5ab42c987ecab39d4435d7dea5cbd954
|
4
|
+
data.tar.gz: 1b5f4944e962b6f8a9de46b2e0076f7873c4bac5f7e5d368f11dbb53a895a777
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1b36a9b073b61afe53c1099bf28015b046638a4cd0c1e5a4d055457e17b16fdd2c8b14c16f85e1977e7d32ba06152f6ae987c262f0d5eb66e5a30dc9998b198c
|
7
|
+
data.tar.gz: 326a4b118bddc0f6a3f96e37ba77ccc11aaf1301e82dd3d43a1a0e13506e5adefc8e7cc3eaf0c2b656ed1f3631dc4e61a3bb16021c4e0b5e47050e1bb2bd97cf
|
data/lib/duo_board_crawling.rb
CHANGED
@@ -17,37 +17,37 @@ using Rainbow
|
|
17
17
|
|
18
18
|
class Naver
|
19
19
|
def initialize
|
20
|
-
|
21
|
-
|
20
|
+
# kill_selenium_chrome #기존 창 모두 닫는 명령
|
21
|
+
# sleep(1)
|
22
22
|
end
|
23
23
|
|
24
|
-
def kill_selenium_chrome #기존 창 모두 닫는 코드
|
25
|
-
|
26
|
-
|
24
|
+
#def kill_selenium_chrome #기존 창 모두 닫는 코드
|
25
|
+
# wmi = WIN32OLE.connect("winmgmts://")
|
26
|
+
# chrome_procs = wmi.ExecQuery("SELECT * FROM Win32_Process WHERE Name = 'chrome.exe'")
|
27
27
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
28
|
+
# chrome_procs.each do |proc|
|
29
|
+
# cmd = proc.CommandLine
|
30
|
+
# if cmd && cmd.include?("user-data-dir=C:/scraping_cookie")
|
31
|
+
# puts "→ 크롬 창 초기화: PID #{proc.ProcessId}"
|
32
|
+
# begin
|
33
|
+
# proc.Terminate
|
34
|
+
# rescue
|
35
35
|
#puts "→ 이미 종료된 프로세스: #{proc.ProcessId}"
|
36
|
-
|
37
|
-
|
38
|
-
|
36
|
+
# end
|
37
|
+
# end
|
38
|
+
# end
|
39
39
|
|
40
40
|
# chromedriver도 같이 종료
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
41
|
+
# chromedrivers = wmi.ExecQuery("SELECT * FROM Win32_Process WHERE Name = 'chromedriver.exe'")
|
42
|
+
# chromedrivers.each do |proc|
|
43
|
+
# puts "→ 크롬 창 초기화: PID #{proc.ProcessId}"
|
44
|
+
# begin
|
45
|
+
# proc.Terminate
|
46
|
+
# rescue
|
47
47
|
#puts "→ 이미 종료된 chromedriver: #{proc.ProcessId}"
|
48
|
-
|
49
|
-
|
50
|
-
end
|
48
|
+
# end
|
49
|
+
# end
|
50
|
+
#end
|
51
51
|
|
52
52
|
def chrome_setup(proxy, scraping_cookie_dir = "C:/scraping_cookie")
|
53
53
|
scraping_cookie_dir = "C:/scraping_cookie"
|
@@ -1592,7 +1592,7 @@ def update(keyword_input, counter, option, captcha_key, sleep_delay, proxy_list)
|
|
1592
1592
|
|
1593
1593
|
|
1594
1594
|
# elements를 찾는 xpath 수정: href에 "/bbs/board.php?"가 포함된 링크만 추출
|
1595
|
-
elements = @driver.find_elements(xpath: '//
|
1595
|
+
elements = @driver.find_elements(xpath: '//a[contains(@href, "/bbs/board.php?")]')
|
1596
1596
|
sleep(1)
|
1597
1597
|
|
1598
1598
|
|
@@ -1625,20 +1625,23 @@ def update(keyword_input, counter, option, captcha_key, sleep_delay, proxy_list)
|
|
1625
1625
|
end
|
1626
1626
|
sleep(sleep_delay)
|
1627
1627
|
# data-hveid="CAkQAA" 요소가 발견되면 다른 랜덤 단어로 재검색
|
1628
|
-
if @driver.find_elements(xpath: '//*[@
|
1628
|
+
if @driver.find_elements(xpath: '//*[@data-meta-area="web_lis"]').empty? ||
|
1629
|
+
@driver.find_elements(xpath: '//a[@role="button" and contains(@class, "btn_next") and @aria-disabled="true"]').any?
|
1630
|
+
|
1629
1631
|
if option['랜덤단어사용'] == 'true'
|
1630
1632
|
puts "검색 결과가 제한적이거나 더 이상 없습니다. 다른 랜덤 단어로 재시도.".red
|
1631
|
-
@keyword_input = random_words.sample
|
1633
|
+
@keyword_input = random_words.sample
|
1632
1634
|
puts "새로운 랜덤 단어 사용: #{@keyword_input}"
|
1633
|
-
start_index = 1
|
1634
|
-
previous_keyword = @keyword_input
|
1635
|
-
next
|
1635
|
+
start_index = 1
|
1636
|
+
previous_keyword = @keyword_input
|
1637
|
+
next
|
1636
1638
|
else
|
1637
1639
|
puts "검색 결과가 더 이상 없습니다. 수집을 종료합니다.".red
|
1638
|
-
no_more_results = true
|
1640
|
+
no_more_results = true
|
1639
1641
|
break
|
1640
1642
|
end
|
1641
1643
|
end
|
1644
|
+
|
1642
1645
|
|
1643
1646
|
# 수집 목표 개수에 도달했으면 종료
|
1644
1647
|
break if collected_count >= @counter # 수집 목표 개수에 도달했으면 종료
|
@@ -1726,7 +1729,7 @@ def update(keyword_input, counter, option, captcha_key, sleep_delay, proxy_list)
|
|
1726
1729
|
|
1727
1730
|
|
1728
1731
|
# elements를 찾는 xpath 수정: href에 "/bbs/board.php?"가 포함된 링크만 추출
|
1729
|
-
elements = @driver.find_elements(xpath: '//
|
1732
|
+
elements = @driver.find_elements(xpath: '//a[contains(@href, "/xe/")]')
|
1730
1733
|
sleep(1)
|
1731
1734
|
|
1732
1735
|
|
@@ -1759,17 +1762,19 @@ def update(keyword_input, counter, option, captcha_key, sleep_delay, proxy_list)
|
|
1759
1762
|
end
|
1760
1763
|
sleep(sleep_delay)
|
1761
1764
|
# data-hveid="CAkQAA" 요소가 발견되면 다른 랜덤 단어로 재검색
|
1762
|
-
if @driver.find_elements(xpath: '//*[@
|
1765
|
+
if @driver.find_elements(xpath: '//*[@data-meta-area="web_lis"]').empty? ||
|
1766
|
+
@driver.find_elements(xpath: '//a[@role="button" and contains(@class, "btn_next") and @aria-disabled="true"]').any?
|
1767
|
+
|
1763
1768
|
if option['랜덤단어사용'] == 'true'
|
1764
1769
|
puts "검색 결과가 제한적이거나 더 이상 없습니다. 다른 랜덤 단어로 재시도.".red
|
1765
|
-
@keyword_input = random_words.sample
|
1770
|
+
@keyword_input = random_words.sample
|
1766
1771
|
puts "새로운 랜덤 단어 사용: #{@keyword_input}"
|
1767
|
-
start_index = 1
|
1768
|
-
previous_keyword = @keyword_input
|
1769
|
-
next
|
1772
|
+
start_index = 1
|
1773
|
+
previous_keyword = @keyword_input
|
1774
|
+
next
|
1770
1775
|
else
|
1771
1776
|
puts "검색 결과가 더 이상 없습니다. 수집을 종료합니다.".red
|
1772
|
-
no_more_results = true
|
1777
|
+
no_more_results = true
|
1773
1778
|
break
|
1774
1779
|
end
|
1775
1780
|
end
|
@@ -1865,8 +1870,8 @@ def update(keyword_input, counter, option, captcha_key, sleep_delay, proxy_list)
|
|
1865
1870
|
|
1866
1871
|
# elements를 찾는 xpath 수정: href에 "/bbs/board.php?"가 포함된 링크만 추출
|
1867
1872
|
elements = []
|
1868
|
-
elements += @driver.find_elements(xpath: '//
|
1869
|
-
elements += @driver.find_elements(xpath: '//
|
1873
|
+
elements += @driver.find_elements(xpath: '//a[contains(@href, "YToxOntzOjEyOiJrZXl3b3JkX3R5cGUiO3M6MzoiYWxsIjt9")]')
|
1874
|
+
elements += @driver.find_elements(xpath: '//a[contains(@href, "YToyOntzOjEyOiJrZXl3b3JkX3R5cGUiO3M6MzoiYWxsIjtzOjQ6InBhZ2UiO2k6NTt9")]')
|
1870
1875
|
|
1871
1876
|
# 2. 추출된 요소들을 처리
|
1872
1877
|
sleep(1)
|
@@ -1901,17 +1906,19 @@ def update(keyword_input, counter, option, captcha_key, sleep_delay, proxy_list)
|
|
1901
1906
|
end
|
1902
1907
|
sleep(sleep_delay)
|
1903
1908
|
# data-hveid="CAkQAA" 요소가 발견되면 다른 랜덤 단어로 재검색
|
1904
|
-
if @driver.find_elements(xpath: '//*[@
|
1909
|
+
if @driver.find_elements(xpath: '//*[@data-meta-area="web_lis"]').empty? ||
|
1910
|
+
@driver.find_elements(xpath: '//a[@role="button" and contains(@class, "btn_next") and @aria-disabled="true"]').any?
|
1911
|
+
|
1905
1912
|
if option['랜덤단어사용'] == 'true'
|
1906
1913
|
puts "검색 결과가 제한적이거나 더 이상 없습니다. 다른 랜덤 단어로 재시도.".red
|
1907
|
-
@keyword_input = random_words.sample
|
1914
|
+
@keyword_input = random_words.sample
|
1908
1915
|
puts "새로운 랜덤 단어 사용: #{@keyword_input}"
|
1909
|
-
start_index = 1
|
1910
|
-
previous_keyword = @keyword_input
|
1911
|
-
next
|
1916
|
+
start_index = 1
|
1917
|
+
previous_keyword = @keyword_input
|
1918
|
+
next
|
1912
1919
|
else
|
1913
1920
|
puts "검색 결과가 더 이상 없습니다. 수집을 종료합니다.".red
|
1914
|
-
no_more_results = true
|
1921
|
+
no_more_results = true
|
1915
1922
|
break
|
1916
1923
|
end
|
1917
1924
|
end
|
@@ -2004,10 +2011,10 @@ def update(keyword_input, counter, option, captcha_key, sleep_delay, proxy_list)
|
|
2004
2011
|
|
2005
2012
|
# elements를 찾는 xpath 수정: href에 "/bbs/board.php?"가 포함된 링크만 추출
|
2006
2013
|
elements = []
|
2014
|
+
elements += @driver.find_elements(xpath: '//a[contains(@href, "?page_id=")]')
|
2015
|
+
elements += @driver.find_elements(xpath: '//a[contains(@href, "?pageid=")]')
|
2016
|
+
elements += @driver.find_elements(xpath: '//a[contains(@href, "/?kboard_id=")]')
|
2007
2017
|
|
2008
|
-
elements += @driver.find_elements(xpath: '//li[@class="bx"]//div[@class="total_wrap"]//div[@class="total_tit_group"]//div[@class="total_source"]//div[@class="source_box"]//a[contains(@class, "thumb") and contains(@href, "?page_id=")]')
|
2009
|
-
elements += @driver.find_elements(xpath: '//li[@class="bx"]//div[@class="total_wrap"]//div[@class="total_tit_group"]//div[@class="total_source"]//div[@class="source_box"]//a[contains(@class, "thumb") and contains(@href, "?pageid=")]')
|
2010
|
-
elements += @driver.find_elements(xpath: '//li[@class="bx"]//div[@class="total_wrap"]//div[@class="total_tit_group"]//div[@class="total_source"]//div[@class="source_box"]//a[contains(@class, "thumb") and contains(@href, "/?kboard_id=")]')
|
2011
2018
|
# 2. 추출된 요소들을 처리
|
2012
2019
|
sleep(1)
|
2013
2020
|
|
@@ -2041,17 +2048,19 @@ def update(keyword_input, counter, option, captcha_key, sleep_delay, proxy_list)
|
|
2041
2048
|
end
|
2042
2049
|
sleep(sleep_delay)
|
2043
2050
|
# data-hveid="CAkQAA" 요소가 발견되면 다른 랜덤 단어로 재검색
|
2044
|
-
if @driver.find_elements(xpath: '//*[@
|
2051
|
+
if @driver.find_elements(xpath: '//*[@data-meta-area="web_lis"]').empty? ||
|
2052
|
+
@driver.find_elements(xpath: '//a[@role="button" and contains(@class, "btn_next") and @aria-disabled="true"]').any?
|
2053
|
+
|
2045
2054
|
if option['랜덤단어사용'] == 'true'
|
2046
2055
|
puts "검색 결과가 제한적이거나 더 이상 없습니다. 다른 랜덤 단어로 재시도.".red
|
2047
|
-
@keyword_input = random_words.sample
|
2056
|
+
@keyword_input = random_words.sample
|
2048
2057
|
puts "새로운 랜덤 단어 사용: #{@keyword_input}"
|
2049
|
-
start_index = 1
|
2050
|
-
previous_keyword = @keyword_input
|
2051
|
-
next
|
2058
|
+
start_index = 1
|
2059
|
+
previous_keyword = @keyword_input
|
2060
|
+
next
|
2052
2061
|
else
|
2053
2062
|
puts "검색 결과가 더 이상 없습니다. 수집을 종료합니다.".red
|
2054
|
-
no_more_results = true
|
2063
|
+
no_more_results = true
|
2055
2064
|
break
|
2056
2065
|
end
|
2057
2066
|
end
|
@@ -2135,15 +2144,15 @@ def update(keyword_input, counter, option, captcha_key, sleep_delay, proxy_list)
|
|
2135
2144
|
end
|
2136
2145
|
|
2137
2146
|
(0..@counter-1).each do |i|
|
2138
|
-
url = "https://search.naver.com/search.naver?nso=&page=2&query=#{@keyword_input}+/write.html
|
2147
|
+
url = "https://search.naver.com/search.naver?nso=&page=2&query=#{@keyword_input}+/write.html?+글쓰기&sm=tab_pge&start=#{start_index}&where=web"
|
2139
2148
|
#puts "요청한 URL: #{url}".yellow
|
2140
2149
|
@driver.get(url)
|
2141
2150
|
sleep(2)
|
2142
2151
|
|
2143
2152
|
|
2144
2153
|
# elements를 찾는 xpath 수정: href에 "/bbs/board.php?"가 포함된 링크만 추출
|
2145
|
-
elements = @driver.find_elements(xpath: '//
|
2146
|
-
|
2154
|
+
elements = @driver.find_elements(xpath: '//a[contains(@href, "write.html?")]')
|
2155
|
+
|
2147
2156
|
sleep(1)
|
2148
2157
|
|
2149
2158
|
|
@@ -2176,17 +2185,19 @@ def update(keyword_input, counter, option, captcha_key, sleep_delay, proxy_list)
|
|
2176
2185
|
end
|
2177
2186
|
sleep(sleep_delay)
|
2178
2187
|
# data-hveid="CAkQAA" 요소가 발견되면 다른 랜덤 단어로 재검색
|
2179
|
-
if @driver.find_elements(xpath: '//*[@
|
2188
|
+
if @driver.find_elements(xpath: '//*[@data-meta-area="web_lis"]').empty? ||
|
2189
|
+
@driver.find_elements(xpath: '//a[@role="button" and contains(@class, "btn_next") and @aria-disabled="true"]').any?
|
2190
|
+
|
2180
2191
|
if option['랜덤단어사용'] == 'true'
|
2181
2192
|
puts "검색 결과가 제한적이거나 더 이상 없습니다. 다른 랜덤 단어로 재시도.".red
|
2182
|
-
@keyword_input = random_words.sample
|
2193
|
+
@keyword_input = random_words.sample
|
2183
2194
|
puts "새로운 랜덤 단어 사용: #{@keyword_input}"
|
2184
|
-
start_index = 1
|
2185
|
-
previous_keyword = @keyword_input
|
2186
|
-
next
|
2195
|
+
start_index = 1
|
2196
|
+
previous_keyword = @keyword_input
|
2197
|
+
next
|
2187
2198
|
else
|
2188
2199
|
puts "검색 결과가 더 이상 없습니다. 수집을 종료합니다.".red
|
2189
|
-
no_more_results = true
|
2200
|
+
no_more_results = true
|
2190
2201
|
break
|
2191
2202
|
end
|
2192
2203
|
end
|
@@ -2917,6 +2928,7 @@ end
|
|
2917
2928
|
end
|
2918
2929
|
|
2919
2930
|
|
2931
|
+
|
2920
2932
|
class Wordpress
|
2921
2933
|
include Glimmer
|
2922
2934
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: duo_board_crawling
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.9
|
4
|
+
version: 0.0.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- zon
|
8
8
|
bindir: bin
|
9
9
|
cert_chain: []
|
10
|
-
date: 2025-
|
10
|
+
date: 2025-07-14 00:00:00.000000000 Z
|
11
11
|
dependencies: []
|
12
12
|
description: File to Clipboard gem
|
13
13
|
email: mymin26@naver.com
|