nytimes-scraper-fork 1.1.2.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nytimes_scraper_fork-1.1.2.dev1/PKG-INFO +94 -0
- nytimes_scraper_fork-1.1.2.dev1/README.md +66 -0
- nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/__init__.py +3 -0
- nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/__main__.py +6 -0
- nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/articles/__init__.py +2 -0
- nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/articles/postprocessing.py +16 -0
- nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/articles/scraper.py +41 -0
- nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/comments/__init__.py +2 -0
- nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/comments/postprocessing.py +23 -0
- nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/comments/scraper.py +77 -0
- nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/comments/util.py +24 -0
- nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/nyt_api/__init__.py +1 -0
- nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/nyt_api/api.py +10 -0
- nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/nyt_api/archive.py +19 -0
- nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/nyt_api/community.py +38 -0
- nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/nyt_api/newswire.py +19 -0
- nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper/scraper.py +65 -0
- nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper_fork.egg-info/PKG-INFO +94 -0
- nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper_fork.egg-info/SOURCES.txt +22 -0
- nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper_fork.egg-info/dependency_links.txt +1 -0
- nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper_fork.egg-info/requires.txt +6 -0
- nytimes_scraper_fork-1.1.2.dev1/nytimes_scraper_fork.egg-info/top_level.txt +1 -0
- nytimes_scraper_fork-1.1.2.dev1/setup.cfg +4 -0
- nytimes_scraper_fork-1.1.2.dev1/setup.py +36 -0
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: nytimes-scraper-fork
|
|
3
|
+
Version: 1.1.2.dev1
|
|
4
|
+
Summary: Scrape article metadata and comments from NYTimes
|
|
5
|
+
Home-page: https://github.com/ietz/nytimes-scraper
|
|
6
|
+
Author: Tim Pietz
|
|
7
|
+
Author-email: tim@pietz.me
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Requires-Dist: cssselect
|
|
14
|
+
Requires-Dist: fire
|
|
15
|
+
Requires-Dist: lxml
|
|
16
|
+
Requires-Dist: pandas
|
|
17
|
+
Requires-Dist: requests
|
|
18
|
+
Requires-Dist: tqdm
|
|
19
|
+
Dynamic: author
|
|
20
|
+
Dynamic: author-email
|
|
21
|
+
Dynamic: classifier
|
|
22
|
+
Dynamic: description
|
|
23
|
+
Dynamic: description-content-type
|
|
24
|
+
Dynamic: home-page
|
|
25
|
+
Dynamic: license
|
|
26
|
+
Dynamic: requires-dist
|
|
27
|
+
Dynamic: summary
|
|
28
|
+
|
|
29
|
+
# nytimes-scraper
|
|
30
|
+
|
|
31
|
+
[](https://pypi.org/project/nytimes-scraper/)
|
|
32
|
+
|
|
33
|
+
Scrape article metadata and comments from NYTimes
|
|
34
|
+
|
|
35
|
+
## Setup
|
|
36
|
+
```bash
|
|
37
|
+
pip install nytimes-scraper
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## CLI usage
|
|
41
|
+
The scraper will automatically fetch every article and all the user comments published on
|
|
42
|
+
[nytimes.com](https://www.nytimes.com/).
|
|
43
|
+
Articles are processed month by month, starting with the current month.
|
|
44
|
+
For each month, a `{year}-{month}-articles.pickle` and `{year}-{month}-comments.pickle` will be
|
|
45
|
+
generated in the current directory.
|
|
46
|
+
If the process is restarted, existing outputs will not be overridden and the scraper will continue
|
|
47
|
+
at the month where it left off.
|
|
48
|
+
To use it, run
|
|
49
|
+
```bash
|
|
50
|
+
python -m nytimes_scraper <API_KEY>
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Programmatic usage
|
|
54
|
+
The scraper can also be started programmatically
|
|
55
|
+
```python
|
|
56
|
+
import datetime as dt
|
|
57
|
+
from nytimes_scraper import run_scraper, scrape_month
|
|
58
|
+
|
|
59
|
+
# scrape february of 2020
|
|
60
|
+
article_df, comment_df = scrape_month('<your_api_key>', date=dt.date(2020, 2, 1))
|
|
61
|
+
|
|
62
|
+
# scrape all articles month by month
|
|
63
|
+
run_scraper('<your_api_key>')
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Alternatively, the `nytimes_scraper.articles` and `nytimes_scraper.comments` modules can be used for more
|
|
67
|
+
fine-grained access:
|
|
68
|
+
```python
|
|
69
|
+
import datetime as dt
|
|
70
|
+
from nytimes_scraper.nyt_api import NytApi
|
|
71
|
+
from nytimes_scraper.articles import fetch_articles_by_month, articles_to_df
|
|
72
|
+
from nytimes_scraper.comments import fetch_comments, fetch_comments_by_article, comments_to_df
|
|
73
|
+
|
|
74
|
+
api = NytApi('<your_api_key>')
|
|
75
|
+
|
|
76
|
+
# Fetch articles of a specific month
|
|
77
|
+
articles = fetch_articles_by_month(api, dt.date(2020, 2, 1))
|
|
78
|
+
article_df = articles_to_df(articles)
|
|
79
|
+
|
|
80
|
+
# Fetch comments from multiple articles
|
|
81
|
+
# a) using the results of a previous article query
|
|
82
|
+
article_ids_and_urls = list(article_df['web_url'].items())
|
|
83
|
+
comments_a = fetch_comments(api, article_ids_and_urls)
|
|
84
|
+
comment_df = comments_to_df(comments_a)
|
|
85
|
+
|
|
86
|
+
# b) using a custom list of articles
|
|
87
|
+
comments_b = fetch_comments(api, article_ids_and_urls=[
|
|
88
|
+
('nyt://article/316ef65c-7021-5755-885c-a9e1ef2cfdf2', 'https://www.nytimes.com/2020/01/03/world/middleeast/trump-iran-suleimani.html'),
|
|
89
|
+
('nyt://article/b2d1b802-412e-51f7-8864-efc931e87bb3', 'https://www.nytimes.com/2020/01/04/opinion/impeachment-witnesses.html'),
|
|
90
|
+
])
|
|
91
|
+
|
|
92
|
+
# Fetch comments for one specific article by its URL
|
|
93
|
+
comments_c = fetch_comments_by_article(api, 'https://www.nytimes.com/2019/11/30/opinion/sunday/bernie-sanders.html')
|
|
94
|
+
```
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# nytimes-scraper
|
|
2
|
+
|
|
3
|
+
[](https://pypi.org/project/nytimes-scraper/)
|
|
4
|
+
|
|
5
|
+
Scrape article metadata and comments from NYTimes
|
|
6
|
+
|
|
7
|
+
## Setup
|
|
8
|
+
```bash
|
|
9
|
+
pip install nytimes-scraper
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
## CLI usage
|
|
13
|
+
The scraper will automatically fetch every article and all the user comments published on
|
|
14
|
+
[nytimes.com](https://www.nytimes.com/).
|
|
15
|
+
Articles are processed month by month, starting with the current month.
|
|
16
|
+
For each month, a `{year}-{month}-articles.pickle` and `{year}-{month}-comments.pickle` will be
|
|
17
|
+
generated in the current directory.
|
|
18
|
+
If the process is restarted, existing outputs will not be overridden and the scraper will continue
|
|
19
|
+
at the month where it left off.
|
|
20
|
+
To use it, run
|
|
21
|
+
```bash
|
|
22
|
+
python -m nytimes_scraper <API_KEY>
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Programmatic usage
|
|
26
|
+
The scraper can also be started programmatically
|
|
27
|
+
```python
|
|
28
|
+
import datetime as dt
|
|
29
|
+
from nytimes_scraper import run_scraper, scrape_month
|
|
30
|
+
|
|
31
|
+
# scrape february of 2020
|
|
32
|
+
article_df, comment_df = scrape_month('<your_api_key>', date=dt.date(2020, 2, 1))
|
|
33
|
+
|
|
34
|
+
# scrape all articles month by month
|
|
35
|
+
run_scraper('<your_api_key>')
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Alternatively, the `nytimes_scraper.articles` and `nytimes_scraper.comments` modules can be used for more
|
|
39
|
+
fine-grained access:
|
|
40
|
+
```python
|
|
41
|
+
import datetime as dt
|
|
42
|
+
from nytimes_scraper.nyt_api import NytApi
|
|
43
|
+
from nytimes_scraper.articles import fetch_articles_by_month, articles_to_df
|
|
44
|
+
from nytimes_scraper.comments import fetch_comments, fetch_comments_by_article, comments_to_df
|
|
45
|
+
|
|
46
|
+
api = NytApi('<your_api_key>')
|
|
47
|
+
|
|
48
|
+
# Fetch articles of a specific month
|
|
49
|
+
articles = fetch_articles_by_month(api, dt.date(2020, 2, 1))
|
|
50
|
+
article_df = articles_to_df(articles)
|
|
51
|
+
|
|
52
|
+
# Fetch comments from multiple articles
|
|
53
|
+
# a) using the results of a previous article query
|
|
54
|
+
article_ids_and_urls = list(article_df['web_url'].items())
|
|
55
|
+
comments_a = fetch_comments(api, article_ids_and_urls)
|
|
56
|
+
comment_df = comments_to_df(comments_a)
|
|
57
|
+
|
|
58
|
+
# b) using a custom list of articles
|
|
59
|
+
comments_b = fetch_comments(api, article_ids_and_urls=[
|
|
60
|
+
('nyt://article/316ef65c-7021-5755-885c-a9e1ef2cfdf2', 'https://www.nytimes.com/2020/01/03/world/middleeast/trump-iran-suleimani.html'),
|
|
61
|
+
('nyt://article/b2d1b802-412e-51f7-8864-efc931e87bb3', 'https://www.nytimes.com/2020/01/04/opinion/impeachment-witnesses.html'),
|
|
62
|
+
])
|
|
63
|
+
|
|
64
|
+
# Fetch comments for one specific article by its URL
|
|
65
|
+
comments_c = fetch_comments_by_article(api, 'https://www.nytimes.com/2019/11/30/opinion/sunday/bernie-sanders.html')
|
|
66
|
+
```
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from typing import List, Dict
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def articles_to_df(articles: List[Dict]) -> pd.DataFrame:
    """Convert raw article dicts into a DataFrame indexed by the article ``_id``.

    Nested fields are flattened by ``pd.json_normalize`` (producing dotted column
    names such as ``byline.organization``), ``pub_date`` is parsed into datetimes,
    and a fixed set of descriptive columns is converted to categoricals.
    Rows whose ``_id`` repeats are dropped, keeping the first occurrence.

    :param articles: article dicts, each carrying an ``_id`` key
    :return: one row per distinct ``_id``; an empty DataFrame with an index named
        ``_id`` when ``articles`` is empty
    """
    if not articles:
        # json_normalize([]) has no columns, so set_index('_id') would raise KeyError
        return pd.DataFrame(index=pd.Index([], name='_id'))

    df = pd.json_normalize(articles).set_index('_id')

    if 'pub_date' in df:
        df['pub_date'] = pd.to_datetime(df['pub_date'])

    categorical_columns = [
        'document_type',
        'news_desk',
        'section_name',
        'subsection_name',
        # misspelled variant kept for backward compatibility with the original column list
        'subsectoinName',
        'type_of_material',
        'byline.organization',
    ]
    for col in categorical_columns:
        if col in df:
            df[col] = df[col].astype(pd.CategoricalDtype())

    return df[~df.index.duplicated()]
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import datetime as dt
|
|
2
|
+
import re
|
|
3
|
+
from typing import List, Dict
|
|
4
|
+
|
|
5
|
+
import lxml.html
|
|
6
|
+
import requests
|
|
7
|
+
from tqdm import tqdm
|
|
8
|
+
|
|
9
|
+
from nytimes_scraper.nyt_api.api import NytApi
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def fetch_articles_by_month(api: NytApi, date: dt.date, show_progress: bool = True) -> List[Dict]:
    """Fetch the article metadata for the year-month of `date`.

    Every article dict returned by the archive API is enriched in place with the
    downloaded page (``html``) and the text extracted from it (``text``).
    Enrichment is best effort: an article whose page cannot be downloaded or parsed
    keeps its metadata and simply lacks those keys, so a single dead link does not
    abort the whole month.

    :param api: API facade; only ``api.archive.archive`` is used
    :param date: any date within the month to fetch
    :param show_progress: show a tqdm progress bar while downloading pages
    :return: the (mutated) list of article dicts
    """
    print(f'Fetching articles for {date.year}-{date.month:02d}')

    articles = api.archive.archive(date.year, date.month)['response']['docs']
    for article in tqdm(articles, unit='Article', disable=not show_progress):
        try:
            article['html'] = fetch_article_html(article['web_url'])
            article['text'] = scrape_article_text(article['html'])
        except (ValueError, requests.RequestException):
            # ValueError was already tolerated (presumably unusable URLs or markup -
            # TODO confirm); RequestException additionally covers HTTP error
            # statuses and connection problems raised while downloading the page.
            pass

    return articles
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def fetch_article_html(article_url: str, timeout: float = 30.0) -> str:
    """Download `article_url` and return the response body as text.

    :param article_url: URL of the article page
    :param timeout: seconds to wait for the server before giving up; without a
        timeout ``requests`` may block indefinitely on a stalled connection
    :raises requests.HTTPError: if the server answers with a 4xx/5xx status
    :raises requests.RequestException: on connection failures or timeouts
    """
    response = requests.get(article_url, timeout=timeout)
    response.raise_for_status()
    return response.text
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def scrape_article_text(article_html: str) -> str:
    """Extract the article body from a page, one line per non-empty body element.

    Elements are picked with a CSS selector below ``section[name="articleBody"]``;
    runs of whitespace inside each element are collapsed to a single space.
    """
    selector = 'section[name="articleBody"] > .StoryBodyCompanionColumn > :first-child > *'

    paragraphs = []
    for element in lxml.html.fromstring(article_html).cssselect(selector):
        collapsed = re.sub(r'[\n\s]+', ' ', element.text_content()).strip()
        if collapsed != '':
            paragraphs.append(collapsed)

    return '\n'.join(paragraphs)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def fetch_recent_articles(api: NytApi):
    """Return the ``results`` entry of a newswire request made with its default arguments."""
    newswire_response = api.newswire.newswire()
    return newswire_response['results']
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from typing import List, Dict
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
from nytimes_scraper.comments.util import flatten_replies, remove_reply_references
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def comments_to_df(comments: List[Dict]) -> pd.DataFrame:
    """Convert a (possibly nested) list of comment dicts into a flat DataFrame.

    Replies are flattened into the same table as their parents and the nested
    ``replies`` lists are dropped. The result is indexed by ``commentID``;
    ``parentID`` becomes a nullable integer, ``status``/``commentType`` become
    categoricals and the ``*Date`` columns are parsed as epoch seconds.

    :param comments: comment dicts, each optionally holding nested ``replies``
    :return: one row per comment or reply; an empty DataFrame with an index named
        ``commentID`` when there are no comments at all
    """
    all_comments = remove_reply_references(flatten_replies(comments))
    if not all_comments:
        # json_normalize([]) has no columns, so set_index('commentID') would raise KeyError
        return pd.DataFrame(index=pd.Index([], name='commentID'))

    df = pd.json_normalize(all_comments).set_index('commentID')

    if 'parentID' in df:
        # nullable integer dtype: top-level comments have no parent
        df['parentID'] = df['parentID'].astype(pd.Int64Dtype())

    for col in ['status', 'commentType']:
        if col in df:
            df[col] = df[col].astype(pd.CategoricalDtype())

    for col in ['createDate', 'updateDate', 'approveDate']:
        if col in df:
            df[col] = pd.to_datetime(df[col], unit='s')

    return df
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
from typing import List, Dict, Optional, Tuple
|
|
2
|
+
|
|
3
|
+
from tqdm import tqdm
|
|
4
|
+
|
|
5
|
+
from nytimes_scraper.comments.util import flatten_replies
|
|
6
|
+
from nytimes_scraper.nyt_api.api import NytApi
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def fetch_comments(api: NytApi, article_ids_and_urls: List[Tuple[str, str]], show_progess: bool = True, pagination_size: int = 100) -> List[Dict]:
    """Collect the comments of several articles into one list.

    `article_ids_and_urls` has the form `[(id_1, url_1), (id_2, url_2), …]`.
    Only the URL is needed to query comments; the ID is merely attached to the
    resulting comment objects so they can be linked back to their article.

    Note that the keyword is spelled `show_progess`; it toggles the progress bar.
    """
    collected: List[Dict] = []

    progress = tqdm(article_ids_and_urls, unit='Article', disable=not show_progess)
    for article_id, article_url in progress:
        collected += fetch_comments_by_article(
            api,
            article_url,
            article_id=article_id,
            pagination_size=pagination_size,
        )

    return collected
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def fetch_comments_by_article(api: NytApi, article_url: str, article_id: Optional[str] = None, pagination_size: int = 100) -> List[Dict]:
    """Return the top-level comments of one article with their replies filled in.

    When `article_id` is given, every comment and reply is tagged with it under
    the ``articleID`` key.
    """
    top_level = fetch_top_level_comments(api, article_url, pagination_size=pagination_size)
    # completes the nested reply lists in place
    fetch_replies(api, article_url, top_level, pagination_size=pagination_size)

    if article_id is None:
        return top_level

    for entry in flatten_replies(top_level):
        entry['articleID'] = article_id
    return top_level
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def fetch_top_level_comments(api: NytApi, article_url: str, pagination_size: int) -> List[Dict]:
    """Fetch all top level comments by paginating through the comment list.

    Might also include some replies.

    :param api: API facade; only ``api.community.get_comments`` is used
    :param article_url: URL of the article whose comments are requested
    :param pagination_size: number of comments requested per page
    :return: the comments collected so far; empty if the very first request is
        rejected, partial if a later page is rejected
    """
    comments = []
    while True:
        response = api.community.get_comments(article_url, offset=len(comments), limit=pagination_size)
        if response['status'] != 'OK':
            # Some multimedia articles don't allow comments and report an error here;
            # on the first page this yields []. If a later page fails, keep what
            # was already fetched instead of throwing it away.
            return comments

        results = response['results']
        new_comments = results['comments']
        comments.extend(new_comments)

        if not new_comments:
            # no progress is possible; also prevents an endless loop when
            # pagination_size <= 0
            return comments

        if len(new_comments) < pagination_size or len(comments) >= results['totalParentCommentsFound']:
            return comments
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def fetch_replies(api: NytApi, article_url: str, comments: List[Dict], pagination_size: int):
    """Fetch all replies for every comment.

    Modifies the comment objects by extending the reply lists. Replies that are
    fetched are queued as well, so their own replies are completed too.

    :param api: API facade; only ``api.community.get_replies`` is used
    :param article_url: URL of the article the comments belong to
    :param comments: comments whose ``replies`` lists may be incomplete
    :param pagination_size: number of replies requested per page
    """
    comment_reply_queue = flatten_replies(comments)
    while len(comment_reply_queue) > 0:
        comment = comment_reply_queue.pop()

        while len(comment['replies']) < comment['replyCount']:
            response = api.community.get_replies(
                article_url=article_url,
                comment_sequence=comment['commentSequence'],
                # request exactly the page size that the short-page check below
                # compares against; otherwise the API default limit is used and a
                # larger pagination_size stops after the first page
                limit=pagination_size,
                offset=len(comment['replies']),
            )
            results = response['results']

            replies = results['comments'][0]['replies']
            comment['replies'].extend(replies)

            comment_reply_queue.extend(flatten_replies(replies))

            if len(replies) < pagination_size:
                # a short page means there is nothing more to fetch
                break
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from typing import List, Dict
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def flatten_replies(comments: List[Dict]) -> List[Dict]:
    """Flatten all comments and replies into one new list without copying or modifying the objects.

    The order is depth-first: every comment is directly followed by its replies
    (and their replies) before the next sibling appears. A comment whose
    ``replies`` entry is missing or ``None`` is treated as having no replies.
    The traversal is iterative, so nesting depth is not bounded by the
    interpreter's recursion limit.
    """
    result = []
    # explicit stack; siblings are pushed reversed so they pop in original order
    pending = list(comments)[::-1]
    while pending:
        comment = pending.pop()
        result.append(comment)
        pending.extend(list(comment.get('replies') or ())[::-1])

    return result
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def remove_reply_references(comments: List[Dict]) -> List[Dict]:
    """Return shallow copies of the comments with the `replies` entry left out.

    The input dicts are not modified; comments without a `replies` key are
    copied unchanged.
    """
    return [
        {key: value for key, value in comment.items() if key != 'replies'}
        for comment in comments
    ]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from nytimes_scraper.nyt_api.api import NytApi
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
from nytimes_scraper.nyt_api.archive import ArchiveApi
|
|
2
|
+
from nytimes_scraper.nyt_api.community import CommunityApi
|
|
3
|
+
from nytimes_scraper.nyt_api.newswire import NewswireApi
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class NytApi:
    """Facade bundling the archive, community and newswire clients.

    All three clients are created with the same API key.
    """

    def __init__(self, api_key: str):
        self.archive, self.community, self.newswire = (
            ArchiveApi(api_key),
            CommunityApi(api_key),
            NewswireApi(api_key),
        )
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from time import sleep
|
|
2
|
+
|
|
3
|
+
import requests
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ArchiveApi:
    """Minimal client for the ``/svc/archive/v1`` endpoint of api.nytimes.com."""

    def __init__(self, api_key: str, timeout: float = 30.0):
        """
        :param api_key: key sent as the ``api-key`` query parameter
        :param timeout: seconds to wait for the server; without a timeout
            ``requests`` may block indefinitely on a stalled connection
        """
        self.api_key = api_key
        self.timeout = timeout

    def archive(self, year: int, month: int):
        """Request the archive of the given year and month and return the decoded JSON body.

        The HTTP status is not inspected here; callers receive whatever JSON the
        server sent.
        """
        response = requests.get(
            url=f'https://api.nytimes.com/svc/archive/v1/{year}/{month}.json',
            params={
                'api-key': self.api_key,
            },
            timeout=self.timeout,
        )

        # fixed one-second pause after every request (presumably crude rate limiting)
        sleep(1)
        return response.json()
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from time import sleep
|
|
2
|
+
|
|
3
|
+
import requests
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class CommunityApi:
    """Minimal client for the comment ``requestHandler`` endpoint on www.nytimes.com."""

    REQUEST_URL = 'https://www.nytimes.com/svc/community/V3/requestHandler'

    def __init__(self, api_key: str, timeout: float = 30.0):
        """
        :param api_key: stored on the instance; the requests below do not send it
        :param timeout: seconds to wait for the server; without a timeout
            ``requests`` may block indefinitely on a stalled connection
        """
        self.api_key = api_key
        self.timeout = timeout

    def _request(self, params: dict):
        """Send one GET request with `params`, pause, and return the decoded JSON body."""
        response = requests.get(
            url=self.REQUEST_URL,
            params=params,
            timeout=self.timeout,
        )

        # fixed one-second pause after every request (presumably crude rate limiting)
        sleep(1)
        return response.json()

    def get_comments(self, article_url: str, limit: int = 100, offset: int = 0, sort: str = 'oldest'):
        """Request one page of comments (``GetCommentsAll``) for `article_url`.

        The HTTP status is not inspected; callers check the ``status`` field of
        the returned JSON.
        """
        return self._request({
            'cmd': 'GetCommentsAll',
            'url': article_url,
            'sort': sort,
            'limit': limit,
            'offset': offset,
        })

    def get_replies(self, article_url: str, comment_sequence: int, limit: int = 100, offset: int = 0):
        """Request one page of replies (``GetRepliesBySequence``) to the comment
        identified by `comment_sequence`."""
        return self._request({
            'cmd': 'GetRepliesBySequence',
            'url': article_url,
            'commentSequence': comment_sequence,
            'limit': limit,
            'offset': offset,
        })
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from time import sleep
|
|
2
|
+
|
|
3
|
+
import requests
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class NewswireApi:
    """Minimal client for the ``/svc/news/v3/content`` endpoint of api.nytimes.com."""

    def __init__(self, api_key: str, timeout: float = 30.0):
        """
        :param api_key: key sent as the ``api-key`` query parameter
        :param timeout: seconds to wait for the server; without a timeout
            ``requests`` may block indefinitely on a stalled connection
        """
        self.api_key = api_key
        self.timeout = timeout

    def newswire(self, source: str = "nyt", section: str = "world"):
        """Request the newswire content for `source` and `section` and return the decoded JSON body.

        The HTTP status is not inspected here; callers receive whatever JSON the
        server sent.
        """
        response = requests.get(
            url=f'https://api.nytimes.com/svc/news/v3/content/{source}/{section}.json',
            params={
                'api-key': self.api_key,
            },
            timeout=self.timeout,
        )

        # fixed one-second pause after every request (presumably crude rate limiting)
        sleep(1)
        return response.json()
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import datetime as dt
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Tuple, Callable
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from nytimes_scraper.articles import fetch_articles_by_month, articles_to_df
|
|
8
|
+
from nytimes_scraper.comments import fetch_comments, comments_to_df
|
|
9
|
+
from nytimes_scraper.nyt_api import NytApi
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
out_dir = Path.cwd()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def run_scraper(api_key: str):
    """Scrape articles and comments one month at a time, walking backwards from the current month.

    The loop has no exit condition of its own; it only ends when an exception
    propagates out of `scrape_month`.
    """
    one_day = dt.timedelta(days=1)

    # first day of the month that is scraped next
    month_start = dt.datetime.now().date().replace(day=1)
    while True:
        scrape_month(api_key, month_start)

        # the day before the first of a month always lies in the previous month
        last_of_previous = month_start - one_day
        month_start = last_of_previous.replace(day=1)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def scrape_month(api_key: str, date: dt.date, force_fetch: bool = False, store: bool = True) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Scrape articles and comments for a given month. The `date.day` is ignored.

    Both results go through `cached`, so an existing output file is loaded
    instead of being fetched again unless `force_fetch` is set.

    :param api_key: key used to construct the `NytApi`
    :param date: any date within the month to scrape
    :param force_fetch: fetch even if an output file already exists
    :param store: write freshly fetched results to their output files
    :return: ``(article_df, comment_df)``
    """
    api = NytApi(api_key)

    article_df = cached(
        fetch=lambda: articles_to_df(fetch_articles_by_month(api, date)),
        file=out_file(date, 'articles'),
        force_fetch=force_fetch,
        store=store,
    )

    # (article id, url) pairs. Series.items() replaces iteritems(), which was
    # removed in pandas 2.0; items() is also available in older releases.
    article_ids_and_urls = list(article_df['web_url'].items())
    comment_df = cached(
        fetch=lambda: comments_to_df(fetch_comments(api, article_ids_and_urls)),
        file=out_file(date, 'comments'),
        force_fetch=force_fetch,
        store=store,
    )

    return article_df, comment_df
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def cached(file: Path, fetch: Callable[[], pd.DataFrame], force_fetch: bool, store: bool) -> pd.DataFrame:
    """Return the DataFrame pickled in `file`, or produce it with `fetch`.

    :param file: location of the pickle file
    :param fetch: zero-argument callable producing the DataFrame
    :param force_fetch: call `fetch` even if `file` already exists
    :param store: after fetching, write the result to `file`, creating missing
        parent directories (including nested ones)
    :return: the fetched or loaded DataFrame
    """
    if file.exists() and not force_fetch:
        # NOTE(security): unpickling can execute arbitrary code - only point this
        # at files written by this tool.
        return pd.read_pickle(str(file))

    df = fetch()

    if store:
        # parents=True so that an output path several levels deep works as well
        file.parent.mkdir(parents=True, exist_ok=True)
        df.to_pickle(str(file))

    return df
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def out_file(date: dt.date, kind: str) -> Path:
    """Build the output path ``<year>-<zero-padded month>-<kind>.pickle`` inside `out_dir`."""
    filename = f'{date.year}-{date.month:02d}-{kind}.pickle'
    return out_dir / filename
|
|
65
|
+
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: nytimes-scraper-fork
|
|
3
|
+
Version: 1.1.2.dev1
|
|
4
|
+
Summary: Scrape article metadata and comments from NYTimes
|
|
5
|
+
Home-page: https://github.com/ietz/nytimes-scraper
|
|
6
|
+
Author: Tim Pietz
|
|
7
|
+
Author-email: tim@pietz.me
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Requires-Dist: cssselect
|
|
14
|
+
Requires-Dist: fire
|
|
15
|
+
Requires-Dist: lxml
|
|
16
|
+
Requires-Dist: pandas
|
|
17
|
+
Requires-Dist: requests
|
|
18
|
+
Requires-Dist: tqdm
|
|
19
|
+
Dynamic: author
|
|
20
|
+
Dynamic: author-email
|
|
21
|
+
Dynamic: classifier
|
|
22
|
+
Dynamic: description
|
|
23
|
+
Dynamic: description-content-type
|
|
24
|
+
Dynamic: home-page
|
|
25
|
+
Dynamic: license
|
|
26
|
+
Dynamic: requires-dist
|
|
27
|
+
Dynamic: summary
|
|
28
|
+
|
|
29
|
+
# nytimes-scraper
|
|
30
|
+
|
|
31
|
+
[](https://pypi.org/project/nytimes-scraper/)
|
|
32
|
+
|
|
33
|
+
Scrape article metadata and comments from NYTimes
|
|
34
|
+
|
|
35
|
+
## Setup
|
|
36
|
+
```bash
|
|
37
|
+
pip install nytimes-scraper
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## CLI usage
|
|
41
|
+
The scraper will automatically fetch every article and all the user comments published on
|
|
42
|
+
[nytimes.com](https://www.nytimes.com/).
|
|
43
|
+
Articles are processed month by month, starting with the current month.
|
|
44
|
+
For each month, a `{year}-{month}-articles.pickle` and `{year}-{month}-comments.pickle` will be
|
|
45
|
+
generated in the current directory.
|
|
46
|
+
If the process is restarted, existing outputs will not be overridden and the scraper will continue
|
|
47
|
+
at the month where it left off.
|
|
48
|
+
To use it, run
|
|
49
|
+
```bash
|
|
50
|
+
python -m nytimes_scraper <API_KEY>
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Programmatic usage
|
|
54
|
+
The scraper can also be started programmatically
|
|
55
|
+
```python
|
|
56
|
+
import datetime as dt
|
|
57
|
+
from nytimes_scraper import run_scraper, scrape_month
|
|
58
|
+
|
|
59
|
+
# scrape february of 2020
|
|
60
|
+
article_df, comment_df = scrape_month('<your_api_key>', date=dt.date(2020, 2, 1))
|
|
61
|
+
|
|
62
|
+
# scrape all articles month by month
|
|
63
|
+
run_scraper('<your_api_key>')
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Alternatively, the `nytimes_scraper.articles` and `nytimes_scraper.comments` modules can be used for more
|
|
67
|
+
fine-grained access:
|
|
68
|
+
```python
|
|
69
|
+
import datetime as dt
|
|
70
|
+
from nytimes_scraper.nyt_api import NytApi
|
|
71
|
+
from nytimes_scraper.articles import fetch_articles_by_month, articles_to_df
|
|
72
|
+
from nytimes_scraper.comments import fetch_comments, fetch_comments_by_article, comments_to_df
|
|
73
|
+
|
|
74
|
+
api = NytApi('<your_api_key>')
|
|
75
|
+
|
|
76
|
+
# Fetch articles of a specific month
|
|
77
|
+
articles = fetch_articles_by_month(api, dt.date(2020, 2, 1))
|
|
78
|
+
article_df = articles_to_df(articles)
|
|
79
|
+
|
|
80
|
+
# Fetch comments from multiple articles
|
|
81
|
+
# a) using the results of a previous article query
|
|
82
|
+
article_ids_and_urls = list(article_df['web_url'].items())
|
|
83
|
+
comments_a = fetch_comments(api, article_ids_and_urls)
|
|
84
|
+
comment_df = comments_to_df(comments_a)
|
|
85
|
+
|
|
86
|
+
# b) using a custom list of articles
|
|
87
|
+
comments_b = fetch_comments(api, article_ids_and_urls=[
|
|
88
|
+
('nyt://article/316ef65c-7021-5755-885c-a9e1ef2cfdf2', 'https://www.nytimes.com/2020/01/03/world/middleeast/trump-iran-suleimani.html'),
|
|
89
|
+
('nyt://article/b2d1b802-412e-51f7-8864-efc931e87bb3', 'https://www.nytimes.com/2020/01/04/opinion/impeachment-witnesses.html'),
|
|
90
|
+
])
|
|
91
|
+
|
|
92
|
+
# Fetch comments for one specific article by its URL
|
|
93
|
+
comments_c = fetch_comments_by_article(api, 'https://www.nytimes.com/2019/11/30/opinion/sunday/bernie-sanders.html')
|
|
94
|
+
```
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
setup.py
|
|
3
|
+
nytimes_scraper/__init__.py
|
|
4
|
+
nytimes_scraper/__main__.py
|
|
5
|
+
nytimes_scraper/scraper.py
|
|
6
|
+
nytimes_scraper/articles/__init__.py
|
|
7
|
+
nytimes_scraper/articles/postprocessing.py
|
|
8
|
+
nytimes_scraper/articles/scraper.py
|
|
9
|
+
nytimes_scraper/comments/__init__.py
|
|
10
|
+
nytimes_scraper/comments/postprocessing.py
|
|
11
|
+
nytimes_scraper/comments/scraper.py
|
|
12
|
+
nytimes_scraper/comments/util.py
|
|
13
|
+
nytimes_scraper/nyt_api/__init__.py
|
|
14
|
+
nytimes_scraper/nyt_api/api.py
|
|
15
|
+
nytimes_scraper/nyt_api/archive.py
|
|
16
|
+
nytimes_scraper/nyt_api/community.py
|
|
17
|
+
nytimes_scraper/nyt_api/newswire.py
|
|
18
|
+
nytimes_scraper_fork.egg-info/PKG-INFO
|
|
19
|
+
nytimes_scraper_fork.egg-info/SOURCES.txt
|
|
20
|
+
nytimes_scraper_fork.egg-info/dependency_links.txt
|
|
21
|
+
nytimes_scraper_fork.egg-info/requires.txt
|
|
22
|
+
nytimes_scraper_fork.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
nytimes_scraper
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from pathlib import Path
from setuptools import setup, find_packages

# The directory containing this file
here = Path(__file__).parent

# The text of the README file. Read explicitly as UTF-8 so the build does not
# depend on the platform's default locale encoding.
readme = (here / 'README.md').read_text(encoding='utf-8')

# This call to setup() does all the work
setup(
    name='nytimes-scraper-fork',
    version='1.1.2.dev1',
    description='Scrape article metadata and comments from NYTimes',
    long_description=readme,
    long_description_content_type='text/markdown',
    url='https://github.com/ietz/nytimes-scraper',
    author='Tim Pietz',
    author_email='tim@pietz.me',
    license='MIT',
    classifiers=[
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.8',
    ],
    packages=find_packages(),
    include_package_data=True,
    install_requires=[
        'cssselect',
        'fire',
        'lxml',
        'pandas',
        'requests',
        'tqdm',
    ],
)
|