nytimes-scraper-fork 1.1.2.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24) hide show
  1. nytimes_scraper_fork-1.1.2.dev1/PKG-INFO +94 -0
  2. nytimes_scraper_fork-1.1.2.dev1/README.md +66 -0
  3. nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/__init__.py +3 -0
  4. nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/__main__.py +6 -0
  5. nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/articles/__init__.py +2 -0
  6. nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/articles/postprocessing.py +16 -0
  7. nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/articles/scraper.py +41 -0
  8. nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/comments/__init__.py +2 -0
  9. nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/comments/postprocessing.py +23 -0
  10. nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/comments/scraper.py +77 -0
  11. nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/comments/util.py +24 -0
  12. nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/nyt_api/__init__.py +1 -0
  13. nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/nyt_api/api.py +10 -0
  14. nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/nyt_api/archive.py +19 -0
  15. nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/nyt_api/community.py +38 -0
  16. nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/nyt_api/newswire.py +19 -0
  17. nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/scraper.py +65 -0
  18. nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper_fork.egg-info/PKG-INFO +94 -0
  19. nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper_fork.egg-info/SOURCES.txt +22 -0
  20. nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper_fork.egg-info/dependency_links.txt +1 -0
  21. nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper_fork.egg-info/requires.txt +6 -0
  22. nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper_fork.egg-info/top_level.txt +1 -0
  23. nytimes_scraper_fork-1.1.2.dev1/setup.cfg +4 -0
  24. nytimes_scraper_fork-1.1.2.dev1/setup.py +36 -0
@@ -0,0 +1,94 @@
1
+ Metadata-Version: 2.4
2
+ Name: nytimes-scraper-fork
3
+ Version: 1.1.2.dev1
4
+ Summary: Scrape article metadata and comments from NYTimes
5
+ Home-page: https://github.com/ietz/nytimes-scraper
6
+ Author: Tim Pietz
7
+ Author-email: tim@pietz.me
8
+ License: MIT
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.8
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: cssselect
14
+ Requires-Dist: fire
15
+ Requires-Dist: lxml
16
+ Requires-Dist: pandas
17
+ Requires-Dist: requests
18
+ Requires-Dist: tqdm
19
+ Dynamic: author
20
+ Dynamic: author-email
21
+ Dynamic: classifier
22
+ Dynamic: description
23
+ Dynamic: description-content-type
24
+ Dynamic: home-page
25
+ Dynamic: license
26
+ Dynamic: requires-dist
27
+ Dynamic: summary
28
+
29
+ # nytimes-scraper
30
+
31
+ [![PyPI](https://img.shields.io/pypi/v/nytimes-scraper)](https://pypi.org/project/nytimes-scraper/)
32
+
33
+ Scrape article metadata and comments from NYTimes
34
+
35
+ ## Setup
36
+ ```bash
37
+ pip install nytimes-scraper
38
+ ```
39
+
40
+ ## CLI usage
41
+ The scraper will automatically fetch every article and all the user comments published on
42
+ [nytimes.com](https://www.nytimes.com/).
43
+ Articles are processed month by month, starting with the current month.
44
+ For each month, a `{year}-{month}-articles.pickle` and `{year}-{month}-comments.pickle` will be
45
+ generated in the current directory.
46
+ If the process is restarted, existing outputs will not be overridden and the scraper will continue
47
+ at the month where it left off.
48
+ To use it, run
49
+ ```bash
50
+ python -m nytimes_scraper <API_KEY>
51
+ ```
52
+
53
+ ## Programmatic usage
54
+ The scraper can also be started programmatically
55
+ ```python
56
+ import datetime as dt
57
+ from nytimes_scraper import run_scraper, scrape_month
58
+
59
+ # scrape february of 2020
60
+ article_df, comment_df = scrape_month('<your_api_key>', date=dt.date(2020, 2, 1))
61
+
62
+ # scrape all articles month by month
63
+ run_scraper('<your_api_key>')
64
+ ```
65
+
66
+ Alternatively, the `nytimes_scraper.articles` and `nytimes_scraper.comments` modules can be used for more
67
+ fine-grained access:
68
+ ```python
69
+ import datetime as dt
70
+ from nytimes_scraper.nyt_api import NytApi
71
+ from nytimes_scraper.articles import fetch_articles_by_month, articles_to_df
72
+ from nytimes_scraper.comments import fetch_comments, fetch_comments_by_article, comments_to_df
73
+
74
+ api = NytApi('<your_api_key>')
75
+
76
+ # Fetch articles of a specific month
77
+ articles = fetch_articles_by_month(api, dt.date(2020, 2, 1))
78
+ article_df = articles_to_df(articles)
79
+
80
+ # Fetch comments from multiple articles
81
+ # a) using the results of a previous article query
82
+ article_ids_and_urls = list(article_df['web_url'].items())
83
+ comments_a = fetch_comments(api, article_ids_and_urls)
84
+ comment_df = comments_to_df(comments_a)
85
+
86
+ # b) using a custom list of articles
87
+ comments_b = fetch_comments(api, article_ids_and_urls=[
88
+ ('nyt://article/316ef65c-7021-5755-885c-a9e1ef2cfdf2', 'https://www.nytimes.com/2020/01/03/world/middleeast/trump-iran-suleimani.html'),
89
+ ('nyt://article/b2d1b802-412e-51f7-8864-efc931e87bb3', 'https://www.nytimes.com/2020/01/04/opinion/impeachment-witnesses.html'),
90
+ ])
91
+
92
+ # Fetch comment for one specific article by its URL
93
+ comments_c = fetch_comments_by_article(api, 'https://www.nytimes.com/2019/11/30/opinion/sunday/bernie-sanders.html')
94
+ ```
@@ -0,0 +1,66 @@
1
+ # nytimes-scraper
2
+
3
+ [![PyPI](https://img.shields.io/pypi/v/nytimes-scraper)](https://pypi.org/project/nytimes-scraper/)
4
+
5
+ Scrape article metadata and comments from NYTimes
6
+
7
+ ## Setup
8
+ ```bash
9
+ pip install nytimes-scraper-fork
10
+ ```
11
+
12
+ ## CLI usage
13
+ The scraper will automatically fetch every article and all the user comments published on
14
+ [nytimes.com](https://www.nytimes.com/).
15
+ Articles are processed month by month, starting with the current month.
16
+ For each month, a `{year}-{month}-articles.pickle` and `{year}-{month}-comments.pickle` will be
17
+ generated in the current directory.
18
+ If the process is restarted, existing outputs will not be overwritten and the scraper will continue
19
+ at the month where it left off.
20
+ To use it, run
21
+ ```bash
22
+ python -m nytimes_scraper <API_KEY>
23
+ ```
24
+
25
+ ## Programmatic usage
26
+ The scraper can also be started programmatically
27
+ ```python
28
+ import datetime as dt
29
+ from nytimes_scraper import run_scraper, scrape_month
30
+
31
+ # scrape february of 2020
32
+ article_df, comment_df = scrape_month('<your_api_key>', date=dt.date(2020, 2, 1))
33
+
34
+ # scrape all articles month by month
35
+ run_scraper('<your_api_key>')
36
+ ```
37
+
38
+ Alternatively, the `nytimes_scraper.articles` and `nytimes_scraper.comments` modules can be used for more
39
+ fine-grained access:
40
+ ```python
41
+ import datetime as dt
42
+ from nytimes_scraper.nyt_api import NytApi
43
+ from nytimes_scraper.articles import fetch_articles_by_month, articles_to_df
44
+ from nytimes_scraper.comments import fetch_comments, fetch_comments_by_article, comments_to_df
45
+
46
+ api = NytApi('<your_api_key>')
47
+
48
+ # Fetch articles of a specific month
49
+ articles = fetch_articles_by_month(api, dt.date(2020, 2, 1))
50
+ article_df = articles_to_df(articles)
51
+
52
+ # Fetch comments from multiple articles
53
+ # a) using the results of a previous article query
54
+ article_ids_and_urls = list(article_df['web_url'].items())
55
+ comments_a = fetch_comments(api, article_ids_and_urls)
56
+ comment_df = comments_to_df(comments_a)
57
+
58
+ # b) using a custom list of articles
59
+ comments_b = fetch_comments(api, article_ids_and_urls=[
60
+ ('nyt://article/316ef65c-7021-5755-885c-a9e1ef2cfdf2', 'https://www.nytimes.com/2020/01/03/world/middleeast/trump-iran-suleimani.html'),
61
+ ('nyt://article/b2d1b802-412e-51f7-8864-efc931e87bb3', 'https://www.nytimes.com/2020/01/04/opinion/impeachment-witnesses.html'),
62
+ ])
63
+
64
+ # Fetch comments for one specific article by its URL
65
+ comments_c = fetch_comments_by_article(api, 'https://www.nytimes.com/2019/11/30/opinion/sunday/bernie-sanders.html')
66
+ ```
@@ -0,0 +1,3 @@
1
+ from nytimes_scraper.scraper import run_scraper, scrape_month
2
+
3
# Keep in sync with the `version` passed to `setup()` in setup.py.
__version__ = '1.1.2.dev1'
@@ -0,0 +1,6 @@
1
+ import fire
2
+
3
+ from nytimes_scraper import run_scraper
4
+
5
# Command line entry point: hand `run_scraper` to python-fire, which builds the CLI
# from the function signature (`python -m nytimes_scraper <API_KEY>`).
if __name__ == '__main__':
    fire.Fire(component=run_scraper)
@@ -0,0 +1,2 @@
1
+ from nytimes_scraper.articles.scraper import fetch_articles_by_month
2
+ from nytimes_scraper.articles.postprocessing import articles_to_df
@@ -0,0 +1,16 @@
1
+ from typing import List, Dict
2
+
3
+ import pandas as pd
4
+
5
+
6
def articles_to_df(articles: List[Dict]) -> pd.DataFrame:
    """Convert raw article documents into a DataFrame indexed by the article `_id`.

    Nested fields are flattened by `pd.json_normalize` (e.g. `byline.organization`),
    `pub_date` is parsed into datetimes and the known low-cardinality columns are
    converted to categoricals. Of rows sharing the same `_id` only the first is kept.

    An empty `articles` list yields an empty frame with an index named `_id` instead
    of failing on the missing `_id` column.
    """
    if not articles:
        return pd.DataFrame(index=pd.Index([], name='_id'))

    df = pd.json_normalize(articles).set_index('_id')

    if 'pub_date' in df:
        df['pub_date'] = pd.to_datetime(df['pub_date'])

    # NOTE(review): API responses appear to use `subsection_name`; the misspelled
    # `subsectoinName` is kept so frames that do carry that column behave as before.
    categorical_columns = [
        'document_type',
        'news_desk',
        'section_name',
        'subsection_name',
        'type_of_material',
        'byline.organization',
        'subsectoinName',
    ]
    for col in categorical_columns:
        if col in df:
            df[col] = df[col].astype(pd.CategoricalDtype())

    return df[~df.index.duplicated()]
@@ -0,0 +1,41 @@
1
+ import datetime as dt
2
+ import re
3
+ from typing import List, Dict
4
+
5
+ import lxml.html
6
+ import requests
7
+ from tqdm import tqdm
8
+
9
+ from nytimes_scraper.nyt_api.api import NytApi
10
+
11
+
12
def fetch_articles_by_month(api: NytApi, date: dt.date, show_progress: bool = True) -> List[Dict]:
    """Fetch the article metadata for the year-month of `date`.

    Each archive document is additionally enriched in place with the page `html` and
    the extracted `text`. The enrichment is best effort: an article whose page cannot
    be used keeps its metadata but gets no `html`/`text` (or only `html` if the text
    extraction is what failed).

    :param api: API bundle whose `archive` client is queried
    :param date: any date within the month to fetch; only year and month are used
    :param show_progress: show a tqdm progress bar while downloading the pages
    :return: the list of article documents of that month
    """
    print(f'Fetching articles for {date.year}-{date.month:02d}')

    articles = api.archive.archive(date.year, date.month)['response']['docs']
    for article in tqdm(articles, unit='Article', disable=not show_progress):
        try:
            article['html'] = fetch_article_html(article['web_url'])
            article['text'] = scrape_article_text(article['html'])
        except (ValueError, requests.HTTPError):
            # ValueError: e.g. a URL or markup that cannot be processed.
            # HTTPError: the page answered with an error status (raise_for_status).
            # Either way skip this article rather than abort the whole month;
            # connection-level failures are deliberately still propagated.
            continue

    return articles
25
+
26
+
27
def fetch_article_html(article_url: str) -> str:
    """Download `article_url` and return the response body as text.

    An error status is turned into an exception by `raise_for_status()`.
    """
    page = requests.get(article_url)
    page.raise_for_status()
    html = page.text
    return html
31
+
32
+
33
def scrape_article_text(article_html: str) -> str:
    """Extract the article body from the page HTML.

    Every element matched by the body selector contributes one line; whitespace runs
    inside an element are collapsed to a single space and empty lines are dropped.
    """
    tree = lxml.html.fromstring(article_html)
    body_selector = 'section[name="articleBody"] > .StoryBodyCompanionColumn > :first-child > *'

    lines = []
    for element in tree.cssselect(body_selector):
        line = re.sub(r'[\n\s]+', ' ', element.text_content()).strip()
        if line != '':
            lines.append(line)

    return '\n'.join(lines)
38
+
39
+
40
def fetch_recent_articles(api: NytApi):
    """Return the `results` entry of a newswire query made with the client's default arguments."""
    newswire_response = api.newswire.newswire()
    return newswire_response['results']
@@ -0,0 +1,2 @@
1
+ from nytimes_scraper.comments.scraper import fetch_comments_by_article, fetch_comments
2
+ from nytimes_scraper.comments.postprocessing import comments_to_df
@@ -0,0 +1,23 @@
1
+ from typing import List, Dict
2
+
3
+ import pandas as pd
4
+
5
+ from nytimes_scraper.comments.util import flatten_replies, remove_reply_references
6
+
7
+
8
def comments_to_df(comments: List[Dict]) -> pd.DataFrame:
    """Convert a (possibly nested) list of comments into a flat DataFrame indexed by `commentID`.

    Replies are flattened into rows of their own and the nested `replies` lists are
    dropped. `parentID` becomes a nullable integer column, `status`/`commentType`
    become categoricals and the `*Date` columns are parsed as epoch seconds.

    An empty `comments` list yields an empty frame with an index named `commentID`
    instead of failing on the missing `commentID` column.
    """
    if not comments:
        return pd.DataFrame(index=pd.Index([], name='commentID'))

    all_comments = remove_reply_references(flatten_replies(comments))
    df = pd.json_normalize(all_comments).set_index('commentID')

    if 'parentID' in df:
        # nullable integer dtype: top-level comments have no parent
        df['parentID'] = df['parentID'].astype(pd.Int64Dtype())

    for col in ['status', 'commentType']:
        if col in df:
            df[col] = df[col].astype(pd.CategoricalDtype())

    for col in ['createDate', 'updateDate', 'approveDate']:
        if col in df:
            # NOTE(review): the timestamps may arrive as numeric strings - confirm.
            # Converting to numbers first keeps `unit='s'` meaning epoch seconds on
            # pandas versions that no longer treat strings as numbers here.
            df[col] = pd.to_datetime(pd.to_numeric(df[col]), unit='s')

    return df
@@ -0,0 +1,77 @@
1
+ from typing import List, Dict, Optional, Tuple
2
+
3
+ from tqdm import tqdm
4
+
5
+ from nytimes_scraper.comments.util import flatten_replies
6
+ from nytimes_scraper.nyt_api.api import NytApi
7
+
8
+
9
def fetch_comments(api: NytApi, article_ids_and_urls: List[Tuple[str, str]], show_progess: bool = True, pagination_size: int = 100) -> List[Dict]:
    """Fetch all comments of several articles.

    `article_ids_and_urls` is a list of `(article_id, article_url)` pairs. Only the URL
    is needed to query the comments; the ID is attached to every comment so it can be
    related back to its article later.

    :param show_progess: show a tqdm progress bar over the articles
    :param pagination_size: page size handed on to the per-article fetch
    :return: the top-level comments of all articles, in input order
    """
    progress = tqdm(article_ids_and_urls, unit='Article', disable=not show_progess)

    collected = []
    for current_id, current_url in progress:
        article_comments = fetch_comments_by_article(
            api,
            current_url,
            article_id=current_id,
            pagination_size=pagination_size,
        )
        collected += article_comments

    return collected
21
+
22
+
23
def fetch_comments_by_article(api: NytApi, article_url: str, article_id: Optional[str] = None, pagination_size: int = 100) -> List[Dict]:
    """Fetch the top-level comments of one article together with all of their replies.

    When `article_id` is given, it is stored as `articleID` on every comment and reply.
    """
    top_level = fetch_top_level_comments(api, article_url, pagination_size=pagination_size)
    # extends the `replies` lists of `top_level` in place
    fetch_replies(api, article_url, top_level, pagination_size=pagination_size)

    if article_id is None:
        return top_level

    for entry in flatten_replies(top_level):
        entry['articleID'] = article_id
    return top_level
34
+
35
+
36
def fetch_top_level_comments(api: NytApi, article_url: str, pagination_size: int) -> List[Dict]:
    """Fetch all top level comments by paginating through the comment list.

    The returned comments might also include some replies. Pagination ends after a
    page shorter than `pagination_size` or once the reported number of parent
    comments has been collected.

    If a page is answered with a status other than `OK`, pagination stops and the
    comments collected up to that point are returned; when this happens on the first
    page the result is an empty list.
    """
    comments = []
    while True:
        response = api.community.get_comments(article_url, offset=len(comments), limit=pagination_size)
        if response['status'] != 'OK':
            # some multimedia articles dont allow comments and instead throw an error here;
            # on a later page keep what was already fetched instead of discarding it
            return comments

        results = response['results']
        new_comments = results['comments']
        comments.extend(new_comments)

        if len(new_comments) < pagination_size or len(comments) >= results['totalParentCommentsFound']:
            return comments
53
+
54
+
55
def fetch_replies(api: NytApi, article_url: str, comments: List[Dict], pagination_size: int):
    """Fetch all replies for every comment.

    Modifies the comment objects by extending their `replies` lists until each holds
    `replyCount` entries or the server returns a short page. Replies that arrive this
    way are queued as well, so their own replies are completed too.

    :param pagination_size: number of replies requested per page; also the threshold
        below which a page counts as the last one
    """
    comment_reply_queue = flatten_replies(comments)
    while comment_reply_queue:
        comment = comment_reply_queue.pop()

        while len(comment['replies']) < comment['replyCount']:
            response = api.community.get_replies(
                article_url=article_url,
                comment_sequence=comment['commentSequence'],
                # ask for the page size that the short-page check below assumes
                limit=pagination_size,
                offset=len(comment['replies']),
            )
            results = response['results']

            replies = results['comments'][0]['replies']
            comment['replies'].extend(replies)

            comment_reply_queue.extend(flatten_replies(replies))

            if len(replies) < pagination_size:
                # short page: nothing more to fetch, even if `replyCount` is higher
                break
@@ -0,0 +1,24 @@
1
+ from typing import List, Dict
2
+
3
+
4
def flatten_replies(comments: List[Dict]) -> List[Dict]:
    """Return every comment followed by its (nested) replies as one flat list.

    The order is depth first: a comment comes directly before its replies. The very
    same dict objects are returned; nothing is copied or modified.
    """
    ordered = []
    # explicit stack; entries are pushed reversed so they pop in original order
    pending = list(comments)[::-1]
    while pending:
        current = pending.pop()
        ordered.append(current)
        pending.extend(list(current['replies'])[::-1])

    return ordered
13
+
14
+
15
def remove_reply_references(comments: List[Dict]) -> List[Dict]:
    """Return shallow copies of the comments that lack the `replies` key.

    The input dicts are left untouched; comments without a `replies` key are copied
    unchanged.
    """
    return [
        {key: value for key, value in comment.items() if key != 'replies'}
        for comment in comments
    ]
@@ -0,0 +1 @@
1
+ from nytimes_scraper.nyt_api.api import NytApi
@@ -0,0 +1,10 @@
1
+ from nytimes_scraper.nyt_api.archive import ArchiveApi
2
+ from nytimes_scraper.nyt_api.community import CommunityApi
3
+ from nytimes_scraper.nyt_api.newswire import NewswireApi
4
+
5
+
6
class NytApi:
    """Bundle of the individual endpoint clients, all created with the same API key.

    The clients are available as the attributes `archive`, `community` and `newswire`.
    """

    def __init__(self, api_key: str):
        self.archive, self.community, self.newswire = (
            ArchiveApi(api_key),
            CommunityApi(api_key),
            NewswireApi(api_key),
        )
@@ -0,0 +1,19 @@
1
+ from time import sleep
2
+
3
+ import requests
4
+
5
+
6
class ArchiveApi:
    """Client for the archive endpoint of the NYTimes API."""

    def __init__(self, api_key: str):
        self.api_key = api_key

    def archive(self, year: int, month: int):
        """Request the archive listing of `year`/`month` and return the decoded JSON body.

        The call sleeps for one second after the request before it returns.
        """
        endpoint = f'https://api.nytimes.com/svc/archive/v1/{year}/{month}.json'
        query = {
            'api-key': self.api_key,
        }
        response = requests.get(url=endpoint, params=query)

        # fixed pause after every request - presumably to respect API rate limits
        sleep(1)
        return response.json()
@@ -0,0 +1,38 @@
1
+ from time import sleep
2
+
3
+ import requests
4
+
5
+
6
class CommunityApi:
    """Client for the NYTimes community request handler (comments and replies).

    The API key is stored on the instance but is not sent with these requests.
    """

    def __init__(self, api_key: str):
        self.api_key = api_key

    def get_comments(self, article_url: str, limit: int = 100, offset: int = 0, sort: str = 'oldest'):
        """Request one page of the comments of `article_url` and return the decoded JSON body."""
        return self._request({
            'cmd': 'GetCommentsAll',
            'url': article_url,
            'sort': sort,
            'limit': limit,
            'offset': offset,
        })

    def get_replies(self, article_url: str, comment_sequence: int, limit: int = 100, offset: int = 0):
        """Request one page of the replies to the comment identified by `comment_sequence`."""
        return self._request({
            'cmd': 'GetRepliesBySequence',
            'url': article_url,
            'commentSequence': comment_sequence,
            'limit': limit,
            'offset': offset,
        })

    @staticmethod
    def _request(params: dict):
        """Send `params` to the request handler, wait one second, return the decoded JSON body."""
        response = requests.get(
            url='https://www.nytimes.com/svc/community/V3/requestHandler',
            params=params,
        )

        # fixed pause after every request - presumably to avoid hammering the endpoint
        sleep(1)
        return response.json()
@@ -0,0 +1,19 @@
1
+ from time import sleep
2
+
3
+ import requests
4
+
5
+
6
class NewswireApi:
    """Client for the newswire (content) endpoint of the NYTimes API."""

    def __init__(self, api_key: str):
        self.api_key = api_key

    def newswire(self, source: str = "nyt", section: str = "world"):
        """Request the content listing of `source`/`section` and return the decoded JSON body.

        The call sleeps for one second after the request before it returns.
        """
        endpoint = f'https://api.nytimes.com/svc/news/v3/content/{source}/{section}.json'
        query = {
            'api-key': self.api_key,
        }
        response = requests.get(url=endpoint, params=query)

        # fixed pause after every request - presumably to respect API rate limits
        sleep(1)
        return response.json()
@@ -0,0 +1,65 @@
1
+ import datetime as dt
2
+ from pathlib import Path
3
+ from typing import Tuple, Callable
4
+
5
+ import pandas as pd
6
+
7
+ from nytimes_scraper.articles import fetch_articles_by_month, articles_to_df
8
+ from nytimes_scraper.comments import fetch_comments, comments_to_df
9
+ from nytimes_scraper.nyt_api import NytApi
10
+
11
+
12
+ out_dir = Path.cwd()  # directory that out_file() places the pickle outputs in; evaluated once at import time
13
+
14
+
15
def run_scraper(api_key: str):
    """Scrape articles and comments one month at a time, walking backwards from the current month.

    The loop has no exit condition of its own: it keeps going until an exception is
    raised or the process is stopped.
    """
    # first day of the month that is scraped next
    month_start = dt.datetime.now().date().replace(day=1)
    while True:
        scrape_month(api_key, month_start)

        # the day before the first of a month always lies in the previous month
        last_day_of_previous = month_start - dt.timedelta(days=1)
        month_start = last_day_of_previous.replace(day=1)
25
+
26
+
27
def scrape_month(api_key: str, date: dt.date, force_fetch: bool = False, store: bool = True) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Scrape articles and comments for a given month. The `date.day` is ignored.

    Both frames go through `cached`, so an existing output file of the month is loaded
    instead of being fetched again unless `force_fetch` is set.

    :param api_key: NYTimes API key used to create the API clients
    :param date: any date within the month to scrape
    :param force_fetch: fetch even if the output files of the month already exist
    :param store: write the fetched frames to their output files
    :return: `(article_df, comment_df)`
    """
    api = NytApi(api_key)

    article_df = cached(
        fetch=lambda: articles_to_df(fetch_articles_by_month(api, date)),
        file=out_file(date, 'articles'),
        force_fetch=force_fetch,
        store=store,
    )

    # (index, value) pairs, i.e. (article id, url). `Series.items()` replaces
    # `iteritems()`, which no longer exists in pandas 2.
    article_ids_and_urls = list(article_df['web_url'].items())
    comment_df = cached(
        fetch=lambda: comments_to_df(fetch_comments(api, article_ids_and_urls)),
        file=out_file(date, 'comments'),
        force_fetch=force_fetch,
        store=store,
    )

    return article_df, comment_df
48
+
49
+
50
def cached(file: Path, fetch: Callable[[], pd.DataFrame], force_fetch: bool, store: bool) -> pd.DataFrame:
    """Return the frame pickled at `file`, computing it with `fetch` when necessary.

    `fetch` is called when `file` does not exist or `force_fetch` is set. Its result is
    pickled to `file` only if `store` is true; missing parent directories (at any
    depth) are created first.

    :param file: location of the pickle file
    :param fetch: zero-argument callable producing the frame
    :param force_fetch: ignore an existing `file` and call `fetch` anyway
    :param store: write the fetched frame to `file`
    :return: the loaded or freshly fetched frame
    """
    if file.exists() and not force_fetch:
        # NOTE: unpickling can execute arbitrary code - only point this at files
        # that were written by this tool.
        return pd.read_pickle(str(file))

    df = fetch()

    if store:
        file.parent.mkdir(parents=True, exist_ok=True)
        df.to_pickle(str(file))

    return df
61
+
62
+
63
def out_file(date: dt.date, kind: str) -> Path:
    """Build the output path `<out_dir>/<year>-<month>-<kind>.pickle`, month zero-padded to two digits."""
    file_name = f'{date.year}-{date.month:02d}-{kind}.pickle'
    return out_dir / file_name
65
+
@@ -0,0 +1,94 @@
1
+ Metadata-Version: 2.4
2
+ Name: nytimes-scraper-fork
3
+ Version: 1.1.2.dev1
4
+ Summary: Scrape article metadata and comments from NYTimes
5
+ Home-page: https://github.com/ietz/nytimes-scraper
6
+ Author: Tim Pietz
7
+ Author-email: tim@pietz.me
8
+ License: MIT
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.8
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: cssselect
14
+ Requires-Dist: fire
15
+ Requires-Dist: lxml
16
+ Requires-Dist: pandas
17
+ Requires-Dist: requests
18
+ Requires-Dist: tqdm
19
+ Dynamic: author
20
+ Dynamic: author-email
21
+ Dynamic: classifier
22
+ Dynamic: description
23
+ Dynamic: description-content-type
24
+ Dynamic: home-page
25
+ Dynamic: license
26
+ Dynamic: requires-dist
27
+ Dynamic: summary
28
+
29
+ # nytimes-scraper
30
+
31
+ [![PyPI](https://img.shields.io/pypi/v/nytimes-scraper)](https://pypi.org/project/nytimes-scraper/)
32
+
33
+ Scrape article metadata and comments from NYTimes
34
+
35
+ ## Setup
36
+ ```bash
37
+ pip install nytimes-scraper
38
+ ```
39
+
40
+ ## CLI usage
41
+ The scraper will automatically fetch every article and all the user comments published on
42
+ [nytimes.com](https://www.nytimes.com/).
43
+ Articles are processed month by month, starting with the current month.
44
+ For each month, a `{year}-{month}-articles.pickle` and `{year}-{month}-comments.pickle` will be
45
+ generated in the current directory.
46
+ If the process is restarted, existing outputs will not be overridden and the scraper will continue
47
+ at the month where it left off.
48
+ To use it, run
49
+ ```bash
50
+ python -m nytimes_scraper <API_KEY>
51
+ ```
52
+
53
+ ## Programmatic usage
54
+ The scraper can also be started programmatically
55
+ ```python
56
+ import datetime as dt
57
+ from nytimes_scraper import run_scraper, scrape_month
58
+
59
+ # scrape february of 2020
60
+ article_df, comment_df = scrape_month('<your_api_key>', date=dt.date(2020, 2, 1))
61
+
62
+ # scrape all articles month by month
63
+ run_scraper('<your_api_key>')
64
+ ```
65
+
66
+ Alternatively, the `nytimes_scraper.articles` and `nytimes_scraper.comments` modules can be used for more
67
+ fine-grained access:
68
+ ```python
69
+ import datetime as dt
70
+ from nytimes_scraper.nyt_api import NytApi
71
+ from nytimes_scraper.articles import fetch_articles_by_month, articles_to_df
72
+ from nytimes_scraper.comments import fetch_comments, fetch_comments_by_article, comments_to_df
73
+
74
+ api = NytApi('<your_api_key>')
75
+
76
+ # Fetch articles of a specific month
77
+ articles = fetch_articles_by_month(api, dt.date(2020, 2, 1))
78
+ article_df = articles_to_df(articles)
79
+
80
+ # Fetch comments from multiple articles
81
+ # a) using the results of a previous article query
82
+ article_ids_and_urls = list(article_df['web_url'].items())
83
+ comments_a = fetch_comments(api, article_ids_and_urls)
84
+ comment_df = comments_to_df(comments_a)
85
+
86
+ # b) using a custom list of articles
87
+ comments_b = fetch_comments(api, article_ids_and_urls=[
88
+ ('nyt://article/316ef65c-7021-5755-885c-a9e1ef2cfdf2', 'https://www.nytimes.com/2020/01/03/world/middleeast/trump-iran-suleimani.html'),
89
+ ('nyt://article/b2d1b802-412e-51f7-8864-efc931e87bb3', 'https://www.nytimes.com/2020/01/04/opinion/impeachment-witnesses.html'),
90
+ ])
91
+
92
+ # Fetch comment for one specific article by its URL
93
+ comments_c = fetch_comments_by_article(api, 'https://www.nytimes.com/2019/11/30/opinion/sunday/bernie-sanders.html')
94
+ ```
@@ -0,0 +1,22 @@
1
+ README.md
2
+ setup.py
3
+ nytimes_scraper/__init__.py
4
+ nytimes_scraper/__main__.py
5
+ nytimes_scraper/scraper.py
6
+ nytimes_scraper/articles/__init__.py
7
+ nytimes_scraper/articles/postprocessing.py
8
+ nytimes_scraper/articles/scraper.py
9
+ nytimes_scraper/comments/__init__.py
10
+ nytimes_scraper/comments/postprocessing.py
11
+ nytimes_scraper/comments/scraper.py
12
+ nytimes_scraper/comments/util.py
13
+ nytimes_scraper/nyt_api/__init__.py
14
+ nytimes_scraper/nyt_api/api.py
15
+ nytimes_scraper/nyt_api/archive.py
16
+ nytimes_scraper/nyt_api/community.py
17
+ nytimes_scraper/nyt_api/newswire.py
18
+ nytimes_scraper_fork.egg-info/PKG-INFO
19
+ nytimes_scraper_fork.egg-info/SOURCES.txt
20
+ nytimes_scraper_fork.egg-info/dependency_links.txt
21
+ nytimes_scraper_fork.egg-info/requires.txt
22
+ nytimes_scraper_fork.egg-info/top_level.txt
@@ -0,0 +1,6 @@
1
+ cssselect
2
+ fire
3
+ lxml
4
+ pandas
5
+ requests
6
+ tqdm
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,36 @@
1
+ from pathlib import Path
2
+ from setuptools import setup, find_packages
3
+
4
# The directory containing this file
here = Path(__file__).parent

# The text of the README file, used as the long description shown on the package index.
# Read explicitly as UTF-8 so the build does not depend on the platform's default encoding.
readme = (here / 'README.md').read_text(encoding='utf-8')

# This call to setup() does all the work
setup(
    name='nytimes-scraper-fork',
    version='1.1.2.dev1',
    description='Scrape article metadata and comments from NYTimes',
    long_description=readme,
    long_description_content_type='text/markdown',
    url='https://github.com/ietz/nytimes-scraper',
    author='Tim Pietz',
    author_email='tim@pietz.me',
    license='MIT',
    classifiers=[
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.8',
    ],
    packages=find_packages(),
    include_package_data=True,
    install_requires=[
        'cssselect',
        'fire',
        'lxml',
        'pandas',
        'requests',
        'tqdm',
    ],
)