google-img-scrap 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +5 -0
- package/README.md +32 -5
- package/package.json +1 -1
- package/src/google-img-scrap.js +31 -1
- package/test/test-filter-titles.js +16 -0
- package/test/test-url-match.js +16 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,10 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
### v1.0.4
|
|
4
|
+
|
|
5
|
+
- New option ```urlMatch```. You can now get images whose URL matches a string (example: "cdn")
|
|
6
|
+
- New option ```filterByTitles```. Filter images by titles
|
|
7
|
+
|
|
3
8
|
### v1.0.3
|
|
4
9
|
- New option ```execute```. Allows you to execute a function, for example to remove "gstatic.com" domains
|
|
5
10
|
|
package/README.md
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
# Google-img-scrap v1.0.
|
|
2
|
-
|
|
1
|
+
# Google-img-scrap v1.0.4
|
|
2
|
+
|
|
3
|
+
Scrape images from Google Images with lots of tools and options.
|
|
3
4
|
|
|
4
5
|
## Update
|
|
5
6
|
|
|
@@ -38,7 +39,9 @@ const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('./src/google-img-scrap');
|
|
|
38
39
|
- "excludeDomains" (Array of String) exclude some domains
|
|
39
40
|
- "safeSearch" (Boolean) enable or disable safe search (for example, for NSFW content)
|
|
40
41
|
- "custom" (String) add extra query
|
|
41
|
-
- "
|
|
42
|
+
- "urlMatch" (Array of Array) get images whose URL matches a string (example: "cdn") | ```example below```
|
|
43
|
+
- "filterByTitles" (Array of Array) filter images by titles | ```example below```
|
|
44
|
+
- "query" (Object) set a query (can be [TYPE, DATE, COLOR, SIZE, LICENCE, EXTENSION]) (use GOOGLE_QUERY items) | ```example below```
|
|
42
45
|
|
|
43
46
|
## Result
|
|
44
47
|
|
|
@@ -87,11 +90,11 @@ console.log(GOOGLE_QUERY);
|
|
|
87
90
|
LICENCE: GOOGLE_QUERY.LICENCE.COMMERCIAL_AND_OTHER,
|
|
88
91
|
EXTENSION: GOOGLE_QUERY.EXTENSION.JPG
|
|
89
92
|
},
|
|
90
|
-
domains: ["alamy.com", "istockphoto.com", "vecteezy.com"
|
|
93
|
+
domains: ["alamy.com", "istockphoto.com", "vecteezy.com"],
|
|
91
94
|
excludeWords: ["black", "white"], //If you don't like black and white cats
|
|
92
95
|
custom: "name=content&name2=content2",
|
|
93
96
|
safeSearch: false,
|
|
94
|
-
// excludeDomains: ["
|
|
97
|
+
// excludeDomains: ["istockphoto.com", "alamy.com"]
|
|
95
98
|
});
|
|
96
99
|
|
|
97
100
|
console.log(test, test.result.length);
|
|
@@ -134,6 +137,30 @@ const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('google-img-scrap');
|
|
|
134
137
|
})();
|
|
135
138
|
```
|
|
136
139
|
|
|
140
|
+
## How do urlMatch and filterByTitles work?
|
|
141
|
+
|
|
142
|
+
- urlMatch works like filterByTitles
|
|
143
|
+
|
|
144
|
+
```js
|
|
145
|
+
const { GOOGLE_IMG_SCRAP } = require('google-img-scrap');
|
|
146
|
+
|
|
147
|
+
(async function(){
|
|
148
|
+
const test = await GOOGLE_IMG_SCRAP({
|
|
149
|
+
search: "cats",
|
|
150
|
+
//will build something like this "(draw and white) or (albino and white)"
|
|
151
|
+
filterByTitles: [
|
|
152
|
+
["draw", "white"],
|
|
153
|
+
["albino", "white"]
|
|
154
|
+
],
|
|
155
|
+
execute: function(element){
|
|
156
|
+
if(!element.url.match('gstatic.com')) return element;
|
|
157
|
+
}
|
|
158
|
+
});
|
|
159
|
+
|
|
160
|
+
console.log(test, test.result.length);
|
|
161
|
+
})();
|
|
162
|
+
```
|
|
163
|
+
|
|
137
164
|
## Google query
|
|
138
165
|
|
|
139
166
|
```js
|
package/package.json
CHANGED
package/src/google-img-scrap.js
CHANGED
|
@@ -89,8 +89,38 @@ async function GOOGLE_IMG_SCRAP(config = {}){
|
|
|
89
89
|
const EXCLUDE_WORDS = [];
|
|
90
90
|
if(config.excludeWords) config.excludeWords.forEach((word) => EXCLUDE_WORDS.push(`-"${word}"`));
|
|
91
91
|
|
|
92
|
+
//filter by titles
|
|
93
|
+
const FILTER_TITLE = [];
|
|
94
|
+
if(config.filterByTitles) config.filterByTitles.forEach((titleFilter) => {
|
|
95
|
+
|
|
96
|
+
const value = titleFilter.map((title) => {
|
|
97
|
+
return `intitle:"${title}"`;
|
|
98
|
+
});
|
|
99
|
+
|
|
100
|
+
FILTER_TITLE.push(`(${value.join(' AND ')})`);
|
|
101
|
+
|
|
102
|
+
});
|
|
103
|
+
|
|
104
|
+
//url match words
|
|
105
|
+
const URL_MATCH = [];
|
|
106
|
+
if(config.urlMatch) config.urlMatch.forEach((urlMatch) => {
|
|
107
|
+
|
|
108
|
+
const value = urlMatch.map((content) => {
|
|
109
|
+
return `inurl:${content}`;
|
|
110
|
+
});
|
|
111
|
+
|
|
112
|
+
URL_MATCH.push(`(${value.join(' AND ')})`);
|
|
113
|
+
|
|
114
|
+
});
|
|
115
|
+
|
|
92
116
|
//building url
|
|
93
|
-
const SEARCH_TERM = config.search +
|
|
117
|
+
const SEARCH_TERM = config.search +
|
|
118
|
+
" " + URL_MATCH.join(" OR ") +
|
|
119
|
+
" " + FILTER_TITLE.join(" OR ") +
|
|
120
|
+
" " + EXCLUDE_WORDS.join(" ") +
|
|
121
|
+
" " + EXCLUDE_DOMAINS.join(" ") +
|
|
122
|
+
" " + DOMAINS.join(" OR ");
|
|
123
|
+
|
|
94
124
|
const SEARCH = encodeURIComponent(SEARCH_TERM.trim())
|
|
95
125
|
const QUERY = Object.assign(GOOGLE_CONSTANT.forceGoogleImage, {
|
|
96
126
|
[GOOGLE_CONSTANT.queryParam]: Object.values(config.query || {}).join(','),
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
const { GOOGLE_IMG_SCRAP } = require('../src/google-img-scrap');
|
|
2
|
+
|
|
3
|
+
(async function(){
|
|
4
|
+
const test = await GOOGLE_IMG_SCRAP({
|
|
5
|
+
search: "cats",
|
|
6
|
+
filterByTitles: [
|
|
7
|
+
["draw", "white"],
|
|
8
|
+
["albino", "white"]
|
|
9
|
+
],
|
|
10
|
+
execute: function(element){
|
|
11
|
+
if(!element.url.match('gstatic.com')) return element;
|
|
12
|
+
}
|
|
13
|
+
});
|
|
14
|
+
|
|
15
|
+
console.log(test, test.result.length);
|
|
16
|
+
})();
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
const { GOOGLE_IMG_SCRAP } = require('../src/google-img-scrap');
|
|
2
|
+
|
|
3
|
+
(async function(){
|
|
4
|
+
const test = await GOOGLE_IMG_SCRAP({
|
|
5
|
+
search: "cats",
|
|
6
|
+
urlMatch: [
|
|
7
|
+
["cdn"],
|
|
8
|
+
["istockphoto"]
|
|
9
|
+
],
|
|
10
|
+
execute: function(element){
|
|
11
|
+
if(!element.url.match('gstatic.com')) return element;
|
|
12
|
+
}
|
|
13
|
+
});
|
|
14
|
+
|
|
15
|
+
console.log(test, test.result.length);
|
|
16
|
+
})();
|