scraper2-hj3415 0.0.3__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. scraper2_hj3415/cli.py +35 -11
  2. scraper2_hj3415/miscrapy/mi/__init__.py +0 -0
  3. scraper2_hj3415/miscrapy/mi/items.py +7 -0
  4. scraper2_hj3415/miscrapy/mi/middlewares.py +103 -0
  5. scraper2_hj3415/miscrapy/mi/pipelines.py +39 -0
  6. scraper2_hj3415/miscrapy/mi/settings.py +103 -0
  7. scraper2_hj3415/miscrapy/mi/spiders/__init__.py +4 -0
  8. scraper2_hj3415/miscrapy/mi/spiders/aud.py +23 -0
  9. scraper2_hj3415/miscrapy/mi/spiders/chf.py +25 -0
  10. scraper2_hj3415/miscrapy/mi/spiders/gbond3y.py +24 -0
  11. scraper2_hj3415/miscrapy/mi/spiders/gold.py +25 -0
  12. scraper2_hj3415/miscrapy/mi/spiders/kosdaq.py +23 -0
  13. scraper2_hj3415/miscrapy/mi/spiders/kospi.py +23 -0
  14. scraper2_hj3415/miscrapy/mi/spiders/mihistory.py +333 -0
  15. scraper2_hj3415/miscrapy/mi/spiders/silver.py +25 -0
  16. scraper2_hj3415/miscrapy/mi/spiders/sp500.py +24 -0
  17. scraper2_hj3415/miscrapy/mi/spiders/usdidx.py +30 -0
  18. scraper2_hj3415/miscrapy/mi/spiders/usdkrw.py +24 -0
  19. scraper2_hj3415/miscrapy/mi/spiders/wti.py +25 -0
  20. scraper2_hj3415/miscrapy/run.py +156 -0
  21. scraper2_hj3415/miscrapy/scrapy.cfg +11 -0
  22. scraper2_hj3415/nfscrapy/nfs/items.py +24 -0
  23. scraper2_hj3415/nfscrapy/nfs/pipelines.py +94 -16
  24. scraper2_hj3415/nfscrapy/nfs/settings.py +2 -1
  25. scraper2_hj3415/nfscrapy/nfs/spiders/_c108.py +95 -0
  26. scraper2_hj3415/nfscrapy/nfs/spiders/c101.py +3 -11
  27. scraper2_hj3415/nfscrapy/nfs/spiders/c103.py +174 -0
  28. scraper2_hj3415/nfscrapy/nfs/spiders/c104.py +228 -0
  29. scraper2_hj3415/nfscrapy/nfs/spiders/c106.py +92 -0
  30. scraper2_hj3415/nfscrapy/nfs/spiders/common.py +60 -0
  31. scraper2_hj3415/nfscrapy/run.py +76 -26
  32. {scraper2_hj3415-0.0.3.dist-info → scraper2_hj3415-0.1.1.dist-info}/METADATA +1 -1
  33. scraper2_hj3415-0.1.1.dist-info/RECORD +42 -0
  34. scraper2_hj3415-0.1.1.dist-info/entry_points.txt +2 -0
  35. scraper2_hj3415-0.0.3.dist-info/RECORD +0 -17
  36. scraper2_hj3415-0.0.3.dist-info/entry_points.txt +0 -2
  37. {scraper2_hj3415-0.0.3.dist-info → scraper2_hj3415-0.1.1.dist-info}/WHEEL +0 -0
  38. {scraper2_hj3415-0.0.3.dist-info → scraper2_hj3415-0.1.1.dist-info}/licenses/LICENSE +0 -0
scraper2_hj3415/cli.py CHANGED
@@ -1,23 +1,47 @@
  from .nfscrapy import run
  from krx_hj3415 import krx
+ from util_hj3415 import utils
  import argparse


- def c101_one(code: str):
-     run.c101([code, ], "mongodb://192.168.0.173:27017")
+ def nfscraper():
+     spiders = ['c101', 'c106', 'c103', 'c104']

-
- def c101_all():
-     run.c101(krx.get_codes(), "mongodb://192.168.0.173:27017")
-
-
- def main():
      parser = argparse.ArgumentParser()
-     parser.add_argument('-c', '--code', action="store", help='code')
+     parser.add_argument('spider', help=f"Spiders - {spiders}")
+     parser.add_argument('target', help="Target for scraping (type 6digit code or 'all')")
+
+     parser.add_argument('-d', '--db_path', help="Set mongo database path")
      args = parser.parse_args()

-     c101_one(args.code)
+     if args.spider in spiders:
+         if args.spider == 'c101':
+             if args.target == 'all':
+                 run.c101(krx.get_codes(), args.db_path) if args.db_path else run.c101(krx.get_codes())
+             elif utils.is_6digit(args.target):
+                 run.c101([args.target, ], args.db_path) if args.db_path else run.c101([args.target, ])
+         if args.spider == 'c103':
+             if args.target == 'all':
+                 x = input("It will take a long time. Are you sure? (y/N)")
+                 if x == 'y' or x == 'Y':
+                     run.c103(krx.get_codes(), args.db_path) if args.db_path else run.c103(krx.get_codes())
+             elif utils.is_6digit(args.target):
+                 run.c103([args.target, ], args.db_path) if args.db_path else run.c103([args.target, ])
+         if args.spider == 'c104':
+             if args.target == 'all':
+                 x = input("It will take a long time. Are you sure? (y/N)")
+                 if x == 'y' or x == 'Y':
+                     run.c104(krx.get_codes(), args.db_path) if args.db_path else run.c104(krx.get_codes())
+             elif utils.is_6digit(args.target):
+                 run.c104([args.target, ], args.db_path) if args.db_path else run.c104([args.target, ])
+         if args.spider == 'c106':
+             if args.target == 'all':
+                 run.c106(krx.get_codes(), args.db_path) if args.db_path else run.c106(krx.get_codes())
+             elif utils.is_6digit(args.target):
+                 run.c106([args.target, ], args.db_path) if args.db_path else run.c106([args.target, ])
+     else:
+         print(f"The spider option should be in {spiders}")


  if __name__ == '__main__':
-     main()
+     nfscraper()
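
A note on the new CLI dispatch above: the four spider branches in nfscraper() differ only in which run.* function they call and whether they ask for confirmation before an 'all' crawl. Purely as an illustration of that structure (this is not code from the package), the same dispatch could be written table-driven; the sketch assumes run.c101/c103/c104/c106 each accept a list of codes plus an optional db_path, exactly as they are invoked above, and uses the absolute form of cli.py's relative import.

    # Hypothetical sketch, not part of scraper2_hj3415: table-driven version of
    # the branch logic in cli.py. Assumes run.* take (codes, db_path=None).
    from krx_hj3415 import krx
    from util_hj3415 import utils
    from scraper2_hj3415.nfscrapy import run


    def dispatch(spider, target, db_path=None):
        runners = {'c101': run.c101, 'c103': run.c103, 'c104': run.c104, 'c106': run.c106}
        needs_confirm = {'c103', 'c104'}          # the long-running spiders
        if spider not in runners:
            print(f"The spider option should be in {list(runners)}")
            return
        if target == 'all':
            if spider in needs_confirm:
                answer = input("It will take a long time. Are you sure? (y/N)")
                if answer.lower() != 'y':
                    return
            codes = krx.get_codes()
        elif utils.is_6digit(target):
            codes = [target]
        else:
            return
        if db_path:
            runners[spider](codes, db_path)
        else:
            runners[spider](codes)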
scraper2_hj3415/miscrapy/mi/__init__.py ADDED
File without changes
scraper2_hj3415/miscrapy/mi/items.py ADDED
@@ -0,0 +1,7 @@
+ import scrapy
+
+
+ class MIitems(scrapy.Item):
+     title = scrapy.Field()
+     date = scrapy.Field()
+     value = scrapy.Field()
scraper2_hj3415/miscrapy/mi/middlewares.py ADDED
@@ -0,0 +1,103 @@
+ # Define here the models for your spider middleware
+ #
+ # See documentation in:
+ # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+ from scrapy import signals
+
+ # useful for handling different item types with a single interface
+ from itemadapter import is_item, ItemAdapter
+
+
+ class MiSpiderMiddleware:
+     # Not all methods need to be defined. If a method is not defined,
+     # scrapy acts as if the spider middleware does not modify the
+     # passed objects.
+
+     @classmethod
+     def from_crawler(cls, crawler):
+         # This method is used by Scrapy to create your spiders.
+         s = cls()
+         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+         return s
+
+     def process_spider_input(self, response, spider):
+         # Called for each response that goes through the spider
+         # middleware and into the spider.
+
+         # Should return None or raise an exception.
+         return None
+
+     def process_spider_output(self, response, result, spider):
+         # Called with the results returned from the Spider, after
+         # it has processed the response.
+
+         # Must return an iterable of Request, or item objects.
+         for i in result:
+             yield i
+
+     def process_spider_exception(self, response, exception, spider):
+         # Called when a spider or process_spider_input() method
+         # (from other spider middleware) raises an exception.
+
+         # Should return either None or an iterable of Request or item objects.
+         pass
+
+     def process_start_requests(self, start_requests, spider):
+         # Called with the start requests of the spider, and works
+         # similarly to the process_spider_output() method, except
+         # that it doesn’t have a response associated.
+
+         # Must return only requests (not items).
+         for r in start_requests:
+             yield r
+
+     def spider_opened(self, spider):
+         spider.logger.info("Spider opened: %s" % spider.name)
+
+
+ class MiDownloaderMiddleware:
+     # Not all methods need to be defined. If a method is not defined,
+     # scrapy acts as if the downloader middleware does not modify the
+     # passed objects.
+
+     @classmethod
+     def from_crawler(cls, crawler):
+         # This method is used by Scrapy to create your spiders.
+         s = cls()
+         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+         return s
+
+     def process_request(self, request, spider):
+         # Called for each request that goes through the downloader
+         # middleware.
+
+         # Must either:
+         # - return None: continue processing this request
+         # - or return a Response object
+         # - or return a Request object
+         # - or raise IgnoreRequest: process_exception() methods of
+         #   installed downloader middleware will be called
+         return None
+
+     def process_response(self, request, response, spider):
+         # Called with the response returned from the downloader.
+
+         # Must either;
+         # - return a Response object
+         # - return a Request object
+         # - or raise IgnoreRequest
+         return response
+
+     def process_exception(self, request, exception, spider):
+         # Called when a download handler or a process_request()
+         # (from other downloader middleware) raises an exception.
+
+         # Must either:
+         # - return None: continue processing this exception
+         # - return a Response object: stops process_exception() chain
+         # - return a Request object: stops process_exception() chain
+         pass
+
+     def spider_opened(self, spider):
+         spider.logger.info("Spider opened: %s" % spider.name)
scraper2_hj3415/miscrapy/mi/pipelines.py ADDED
@@ -0,0 +1,39 @@
+
+ import logging
+ logger = logging.getLogger(__name__)
+ formatter = logging.Formatter('%(levelname)s: [%(name)s] %(message)s')
+ ch = logging.StreamHandler()
+ ch.setFormatter(formatter)
+ logger.addHandler(ch)
+ logger.setLevel(logging.INFO)
+
+ # A single spider delivers three consecutive days' worth of data.
+
+
+ class ValidationPipeline:
+     def process_item(self, item, spider):
+         pass
+
+
+ class MongoPipeline:
+     def open_spider(self, spider):
+         self.client = spider.mongo_client
+
+
+
+
+     from db_hj3415 import mongo2, dbpath
+
+     client = mongo2.connect_mongo(dbpath.load())
+
+     # Pipeline that saves items to the MongoDB database
+     def process_item(self, item, spider):
+         """
+         Item structure
+         title = scrapy.Field()
+         date = scrapy.Field()
+         value = scrapy.Field()
+         """
+         print(f"In the {self.__class__.__name__}...date : {item['date']} / title : {item['title']} / value : {item['value']}")
+         mongo2.MI(self.client, item['title']).save(mi_dict={"date": item['date'], "value": item['value']})
+         return item
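
For context on the pipeline chain configured in settings.py below: Scrapy passes each scraped item through ITEM_PIPELINES in priority order, and whatever one pipeline's process_item returns becomes the input of the next, so a pass-through validator normally ends with return item (or raises DropItem). A minimal sketch of that contract follows, with a made-up validation rule that is not part of this package.

    # Illustrative only -- hypothetical validator showing the standard Scrapy
    # pipeline contract (return the item or raise DropItem); the numeric check
    # is an assumption, not scraper2_hj3415 code.
    from scrapy.exceptions import DropItem


    class ValidationPipelineSketch:
        def process_item(self, item, spider):
            try:
                float(item['value'])                 # MIitems carries value as cleaned text
            except (KeyError, TypeError, ValueError):
                raise DropItem(f"Invalid value in {item!r}")
            return item                              # hand the item on to the next pipeline (e.g. MongoPipeline)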
scraper2_hj3415/miscrapy/mi/settings.py ADDED
@@ -0,0 +1,103 @@
+ # Scrapy settings for mi project
+ #
+ # For simplicity, this file contains only settings considered important or
+ # commonly used. You can find more settings consulting the documentation:
+ #
+ # https://docs.scrapy.org/en/latest/topics/settings.html
+ # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+ # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+ BOT_NAME = "mi"
+
+ SPIDER_MODULES = ["mi.spiders"]
+ NEWSPIDER_MODULE = "mi.spiders"
+
+
+ # Crawl responsibly by identifying yourself (and your website) on the user-agent
+ #USER_AGENT = "mi (+http://www.yourdomain.com)"
+
+ # Obey robots.txt rules
+ ROBOTSTXT_OBEY = False
+
+ # Configure maximum concurrent requests performed by Scrapy (default: 16)
+ #CONCURRENT_REQUESTS = 32
+
+ # Configure a delay for requests for the same website (default: 0)
+ # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+ # See also autothrottle settings and docs
+ DOWNLOAD_DELAY = 1
+ RANDOMIZE_DOWNLOAD_DELAY = True
+
+ # The download delay setting will honor only one of:
+ #CONCURRENT_REQUESTS_PER_DOMAIN = 16
+ #CONCURRENT_REQUESTS_PER_IP = 16
+
+ # Disable cookies (enabled by default)
+ COOKIES_ENABLED = False
+
+ # Disable Telnet Console (enabled by default)
+ #TELNETCONSOLE_ENABLED = False
+
+ # Override the default request headers:
+ #DEFAULT_REQUEST_HEADERS = {
+ #    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+ #    "Accept-Language": "en",
+ #}
+
+ # Enable or disable spider middlewares
+ # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+ #SPIDER_MIDDLEWARES = {
+ #    "mi.middlewares.MiSpiderMiddleware": 543,
+ #}
+
+ # Enable or disable downloader middlewares
+ # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+ #DOWNLOADER_MIDDLEWARES = {
+ #    "mi.middlewares.MiDownloaderMiddleware": 543,
+ #}
+
+ # Enable or disable extensions
+ # See https://docs.scrapy.org/en/latest/topics/extensions.html
+ #EXTENSIONS = {
+ #    "scrapy.extensions.telnet.TelnetConsole": None,
+ #}
+
+ # Configure item pipelines
+ # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+ ITEM_PIPELINES = {
+     "mi.pipelines.ValidationPipeline": 300,
+     "mi.pipelines.MongoPipeline": 400,
+ }
+
+ # Enable and configure the AutoThrottle extension (disabled by default)
+ # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+ #AUTOTHROTTLE_ENABLED = True
+ # The initial download delay
+ #AUTOTHROTTLE_START_DELAY = 5
+ # The maximum download delay to be set in case of high latencies
+ #AUTOTHROTTLE_MAX_DELAY = 60
+ # The average number of requests Scrapy should be sending in parallel to
+ # each remote server
+ #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+ # Enable showing throttling stats for every response received:
+ #AUTOTHROTTLE_DEBUG = False
+
+ # Enable and configure HTTP caching (disabled by default)
+ # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+ #HTTPCACHE_ENABLED = True
+ #HTTPCACHE_EXPIRATION_SECS = 0
+ #HTTPCACHE_DIR = "httpcache"
+ #HTTPCACHE_IGNORE_HTTP_CODES = []
+ #HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+
+ # Set settings whose default value is deprecated to a future-proof value
+ REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
+ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
+ FEED_EXPORT_ENCODING = "utf-8"
+
+ LOG_ENABLED = False
+ #LOG_LEVEL = 'DEBUG'
+ LOG_LEVEL = 'WARNING'
+
+ # The port range to use for the telnet console. If set to None or 0, a dynamically assigned port is used.
+ TELNETCONSOLE_PORT = None
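
The SPIDER_MODULES, ITEM_PIPELINES and delay settings above are what a programmatic runner picks up. miscrapy/run.py (+156 lines, not shown in this section) does the actual wiring; the following is only a minimal sketch of how such settings are consumed when the spiders are launched from Python with the standard Scrapy API, assuming it is executed from the miscrapy project directory where scrapy.cfg lives.

    # Sketch only -- not the package's run.py. Shows the standard way mi/settings.py
    # is picked up when crawling from a script instead of the scrapy CLI.
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings


    def crawl_market_indexes():
        settings = get_project_settings()            # reads mi/settings.py via scrapy.cfg
        process = CrawlerProcess(settings)
        for name in ('kospi', 'kosdaq', 'gbond3y', 'aud', 'chf', 'gold'):
            process.crawl(name)                      # spider names as defined in mi/spiders above
        process.start()                              # blocks until every crawl finishes


    if __name__ == '__main__':
        crawl_market_indexes()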
scraper2_hj3415/miscrapy/mi/spiders/__init__.py ADDED
@@ -0,0 +1,4 @@
+ # This package will contain the spiders of your Scrapy project
+ #
+ # Please refer to the documentation for information on how to create and manage
+ # your spiders.
scraper2_hj3415/miscrapy/mi/spiders/aud.py ADDED
@@ -0,0 +1,23 @@
+ import scrapy
+ from mi import items
+
+
+ class AudSpider(scrapy.Spider):
+     name = 'aud'
+     allowed_domains = ['finance.naver.com']
+     start_urls = ['https://finance.naver.com/marketindex/worldDailyQuote.nhn?fdtc=4&marketindexCd=FX_USDAUD']
+
+     def parse(self, response):
+         print(f"Parsing ...'{self.name}' page", flush=True)
+         item = items.MIitems()
+         # Code that scrapes the S&P 500
+         self.logger.info(response.url)
+         # Scrape the three most recent days of data.
+         for r in range(3, 0, -1):
+             item['title'] = self.name
+             item['date'] = (response.xpath(f'/html/body/div/table/tbody/tr[{r}]/td[1]/text()')
+                             .get().replace('\t', '').replace('\n', ''))
+             item['value'] = (response.xpath(f'/html/body/div/table/tbody/tr[{r}]/td[2]/text()')
+                             .get().replace(',', '').replace('\t', '').replace('\n', ''))
+             self.logger.info(f"date : {item['date']}, value : {item['value']}")
+             yield item
scraper2_hj3415/miscrapy/mi/spiders/chf.py ADDED
@@ -0,0 +1,25 @@
+ import scrapy
+ from mi import items
+
+ # cmd usage : scrapy crawl chf
+
+
+ class ChfSpider(scrapy.Spider):
+     name = 'chf'
+     allowed_domains = ['finance.naver.com']
+     start_urls = ['https://finance.naver.com/marketindex/worldDailyQuote.nhn?fdtc=4&marketindexCd=FX_USDCHF']
+
+     def parse(self, response):
+         print(f"Parsing ...'{self.name}' page", flush=True)
+         item = items.MIitems()
+         # Code that scrapes the S&P 500
+         self.logger.info(response.url)
+         # Scrape the three most recent days of data.
+         for r in range(3, 0, -1):
+             item['title'] = self.name
+             item['date'] = (response.xpath(f'/html/body/div/table/tbody/tr[{r}]/td[1]/text()')
+                             .get().replace('\t', '').replace('\n', ''))
+             item['value'] = (response.xpath(f'/html/body/div/table/tbody/tr[{r}]/td[2]/text()')
+                             .get().replace(',', '').replace('\t', '').replace('\n', ''))
+             self.logger.info(f"date : {item['date']}, value : {item['value']}")
+             yield item
scraper2_hj3415/miscrapy/mi/spiders/gbond3y.py ADDED
@@ -0,0 +1,24 @@
+ import scrapy
+ from mi import items
+
+ # cmd usage : scrapy crawl gbond3y
+
+
+ class Gbond3ySpider(scrapy.Spider):
+     name = 'gbond3y'
+     allowed_domains = ['finance.naver.com']
+     start_urls = ['https://finance.naver.com/marketindex/interestDailyQuote.nhn?marketindexCd=IRR_GOVT03Y']
+
+     def parse(self, response):
+         print(f"Parsing ...'{self.name}' page", flush=True)
+         item = items.MIitems()
+         # Code that scrapes the 3-year Korean treasury bond yield
+         self.logger.info(response.url)
+         for r in range(3, 0, -1):
+             item['title'] = self.name
+             item['date'] = (response.css(f'body > div > table > tbody > tr:nth-child({r}) > td.date::text')
+                             .extract()[0].replace('\n', '').replace('\t', ''))
+             item['value'] = (response.css(f'body > div > table > tbody > tr:nth-child({r}) > td:nth-child(2)::text')
+                             .extract()[0])
+             self.logger.info(f"date : {item['date']}, value : {item['value']}")
+             yield item
scraper2_hj3415/miscrapy/mi/spiders/gold.py ADDED
@@ -0,0 +1,25 @@
+ import scrapy
+ from mi import items
+
+ # cmd usage : scrapy crawl gold
+
+
+ class GoldSpider(scrapy.Spider):
+     name = 'gold'
+     allowed_domains = ['finance.naver.com']
+     start_urls = ['https://finance.naver.com/marketindex/worldDailyQuote.nhn?marketindexCd=CMDT_GC&fdtc=2']
+
+     def parse(self, response):
+         print(f"Parsing ...'{self.name}' page", flush=True)
+         item = items.MIitems()
+         # Code that scrapes the S&P 500
+         self.logger.info(response.url)
+         # Scrape the three most recent days of data.
+         for r in range(3, 0, -1):
+             item['title'] = self.name
+             item['date'] = (response.xpath(f'/html/body/div/table/tbody/tr[{r}]/td[1]/text()')
+                             .get().replace('\t', '').replace('\n', ''))
+             item['value'] = (response.xpath(f'/html/body/div/table/tbody/tr[{r}]/td[2]/text()')
+                             .get().replace(',', '').replace('\t', '').replace('\n', ''))
+             self.logger.info(f"date : {item['date']}, value : {item['value']}")
+             yield item
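
aud.py, chf.py and gold.py above (and, judging from the file list, silver.py and wti.py as well) repeat the same parse() body and differ only in name and start_urls. Purely as a reading aid, that shared pattern can be restated as one hypothetical base class; nothing below exists in the package.

    # Hypothetical restatement of the shared worldDailyQuote pattern -- not package code.
    import scrapy
    from mi import items


    class DailyQuoteSpiderSketch(scrapy.Spider):
        """Yields the three most recent (date, value) rows of a Naver daily-quote table."""
        allowed_domains = ['finance.naver.com']

        def parse(self, response):
            for r in range(3, 0, -1):                # rows 3..1 of the quote table, one item per day
                item = items.MIitems()
                item['title'] = self.name
                item['date'] = (response.xpath(f'/html/body/div/table/tbody/tr[{r}]/td[1]/text()')
                                .get().replace('\t', '').replace('\n', ''))
                item['value'] = (response.xpath(f'/html/body/div/table/tbody/tr[{r}]/td[2]/text()')
                                 .get().replace(',', '').replace('\t', '').replace('\n', ''))
                yield item


    # Each concrete spider would then only supply its name and URL, e.g.:
    class AudSpiderSketch(DailyQuoteSpiderSketch):
        name = 'aud'
        start_urls = ['https://finance.naver.com/marketindex/worldDailyQuote.nhn?fdtc=4&marketindexCd=FX_USDAUD']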
scraper2_hj3415/miscrapy/mi/spiders/kosdaq.py ADDED
@@ -0,0 +1,23 @@
+ import scrapy
+ from mi import items
+
+ # cmd usage : scrapy crawl kosdaq
+
+
+ class KosdaqSpider(scrapy.Spider):
+     name = 'kosdaq'
+     allowed_domains = ['finance.naver.com']
+     start_urls = ['https://finance.naver.com/sise/sise_index_day.nhn?code=KOSDAQ']
+
+     def parse(self, response):
+         print(f"Parsing ...'{self.name}' page", flush=True)
+         item = items.MIitems()
+         # Code that scrapes the KOSDAQ index
+         self.logger.info(response.url)
+         # Scrape the three most recent days of data.
+         for r in range(3, 6):
+             item['title'] = self.name
+             item['date'] = response.xpath(f'/html/body/div/table[1]/tr[{r}]/td[1]/text()').get()
+             item['value'] = response.xpath(f'/html/body/div/table[1]/tr[{r}]/td[2]/text()').get().replace(',', '')
+             self.logger.info(f"date : {item['date']}, value : {item['value']}")
+             yield item
scraper2_hj3415/miscrapy/mi/spiders/kospi.py ADDED
@@ -0,0 +1,23 @@
+ import scrapy
+ from mi import items
+
+ # cmd usage : scrapy crawl kospi
+
+
+ class KospiSpider(scrapy.Spider):
+     name = 'kospi'
+     allowed_domains = ['finance.naver.com']
+     start_urls = ['https://finance.naver.com/sise/sise_index_day.nhn?code=KOSPI']
+
+     def parse(self, response):
+         print(f"Parsing ...'{self.name}' page", flush=True)
+         item = items.MIitems()
+         # Code that scrapes the KOSPI index
+         self.logger.info(response.url)
+         # Scrape the three most recent days of data.
+         for i in range(3, 6):
+             item['title'] = self.name
+             item['date'] = response.xpath(f'/html/body/div/table[1]/tr[{i}]/td[1]/text()').get()
+             item['value'] = response.xpath(f'/html/body/div/table[1]/tr[{i}]/td[2]/text()').get().replace(',', '')
+             self.logger.info(f"date : {item['date']}, value : {item['value']}")
+             yield item