google-img-scrap 1.0.0 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +17 -0
- package/README.md +100 -19
- package/package.json +2 -1
- package/src/google-img-scrap.js +49 -3
- package/test/test-filter-titles.js +16 -0
- package/test/test-url-match.js +16 -0
- package/test/test-wallpaper.js +17 -0
- package/test/test.js +7 -3
package/CHANGELOG.md
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
### v1.0.4
|
|
4
|
+
|
|
5
|
+
- New option ```urlMatch```. You can now get images whose URL matches a string (example: "cdn")
|
|
6
|
+
- New option ```filterByTitles```. Filter images by titles
|
|
7
|
+
|
|
8
|
+
### v1.0.3
|
|
9
|
+
- New option ```execute```. Allows you to run a function on each result, for example to remove "gstatic.com" domains
|
|
10
|
+
|
|
11
|
+
### v1.0.2
|
|
12
|
+
- Cannot set 'domains' and 'excludeDomains' at the same time
|
|
13
|
+
- Fixed some bugs
|
|
14
|
+
- New option ```excludeWords```
|
|
15
|
+
|
|
16
|
+
### v1.0.1
|
|
17
|
+
- Added the missing dependency
|
package/README.md
CHANGED
|
@@ -1,5 +1,10 @@
|
|
|
1
|
-
# Google-img-scrap v1.0.
|
|
2
|
-
|
|
1
|
+
# Google-img-scrap v1.0.4
|
|
2
|
+
|
|
3
|
+
Scrape images from Google Images with a lot of tools and options.
|
|
4
|
+
|
|
5
|
+
## Update
|
|
6
|
+
|
|
7
|
+
- See [changelog](CHANGELOG.md)
|
|
3
8
|
|
|
4
9
|
## Found a bug ?
|
|
5
10
|
|
|
@@ -13,21 +18,56 @@ npm i google-img-scrap
|
|
|
13
18
|
|
|
14
19
|
## Import
|
|
15
20
|
|
|
21
|
+
- NPM
|
|
22
|
+
|
|
23
|
+
```js
|
|
24
|
+
const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('google-img-scrap');
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
- From GITHUB
|
|
28
|
+
|
|
16
29
|
```js
|
|
17
|
-
const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('
|
|
30
|
+
const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('./src/google-img-scrap');
|
|
18
31
|
```
|
|
19
32
|
|
|
33
|
+
## Params
|
|
34
|
+
|
|
35
|
+
- "search" (String) what you want to search
|
|
36
|
+
- "execute" (Function) lets you run a function on each result, for example to remove "gstatic.com" domains
|
|
37
|
+
- "excludeWords" (Array of String) exclude some words from the search
|
|
38
|
+
- "domains" (Array of String) filter by domains
|
|
39
|
+
- "excludeDomains" (Array of String) exclude some domains
|
|
40
|
+
- "safeSearch" (Boolean) enable or disable safe search (for example, for NSFW content)
|
|
41
|
+
- "custom" (String) add extra query
|
|
42
|
+
- "urlMatch" (Array of Array) get images whose URL matches a string (example: "cdn") | ```example below```
|
|
43
|
+
- "filterByTitles" (Array of Array) filter images by titles | ```example below```
|
|
44
|
+
- "query" (Object) set a query (can be [TYPE, DATE, COLOR, SIZE, LICENCE, EXTENSION]) (use GOOGLE_QUERY items) | ```example below```
|
|
45
|
+
|
|
20
46
|
## Result
|
|
21
47
|
|
|
22
48
|
```js
|
|
23
|
-
|
|
49
|
+
}
|
|
50
|
+
{
|
|
51
|
+
url: 'https://images.google.com/search?tbm=isch&tbs=itp:clipart,qdr:y,ic:gray,isz:l,il:ol,ift:jpg&q=cats',
|
|
52
|
+
result: [
|
|
53
|
+
{
|
|
54
|
+
url: 'https://media.istockphoto.com/vectors/black-cats-set-vector-id599123506',
|
|
55
|
+
height: '806',
|
|
56
|
+
width: '1024'
|
|
57
|
+
},
|
|
24
58
|
{
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
59
|
+
url: 'https://media.istockphoto.com/vectors/cats-vector-id455327075',
|
|
60
|
+
height: '860',
|
|
61
|
+
width: '1024'
|
|
62
|
+
},
|
|
63
|
+
{
|
|
64
|
+
url: 'https://media.istockphoto.com/vectors/purring-cats-vector-silhouette-vector-id165749810?s=2048x2048',
|
|
65
|
+
height: '1895',
|
|
66
|
+
width: '2048'
|
|
28
67
|
},
|
|
29
68
|
...
|
|
30
|
-
]
|
|
69
|
+
]
|
|
70
|
+
}
|
|
31
71
|
```
|
|
32
72
|
|
|
33
73
|
## How to use ?
|
|
@@ -35,7 +75,9 @@ const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('../src/google-img-scrap');
|
|
|
35
75
|
- For the query parameter you need to set the name in upper case !
|
|
36
76
|
|
|
37
77
|
```js
|
|
38
|
-
const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('
|
|
78
|
+
const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('google-img-scrap');
|
|
79
|
+
|
|
80
|
+
console.log(GOOGLE_QUERY);
|
|
39
81
|
|
|
40
82
|
(async function(){
|
|
41
83
|
const test = await GOOGLE_IMG_SCRAP({
|
|
@@ -48,10 +90,11 @@ const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('../src/google-img-scrap');
|
|
|
48
90
|
LICENCE: GOOGLE_QUERY.LICENCE.COMMERCIAL_AND_OTHER,
|
|
49
91
|
EXTENSION: GOOGLE_QUERY.EXTENSION.JPG
|
|
50
92
|
},
|
|
51
|
-
domains: [],
|
|
93
|
+
domains: ["alamy.com", "istockphoto.com", "vecteezy.com"],
|
|
94
|
+
excludeWords: ["black", "white"], //If you don't like black and white cats
|
|
52
95
|
custom: "name=content&name2=content2",
|
|
53
96
|
safeSearch: false,
|
|
54
|
-
excludeDomains: []
|
|
97
|
+
// excludeDomains: ["istockphoto.com", "alamy.com"]
|
|
55
98
|
});
|
|
56
99
|
|
|
57
100
|
console.log(test, test.result.length);
|
|
@@ -61,7 +104,7 @@ const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('../src/google-img-scrap');
|
|
|
61
104
|
OR ALSO
|
|
62
105
|
|
|
63
106
|
```js
|
|
64
|
-
const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('
|
|
107
|
+
const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('google-img-scrap');
|
|
65
108
|
|
|
66
109
|
(async function(){
|
|
67
110
|
const test = await GOOGLE_IMG_SCRAP({
|
|
@@ -71,14 +114,52 @@ const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('../src/google-img-scrap');
|
|
|
71
114
|
console.log(test, test.result.length);
|
|
72
115
|
})();
|
|
73
116
|
```
|
|
74
|
-
## Params
|
|
75
117
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
118
|
+
## Removing gstatic.com
|
|
119
|
+
|
|
120
|
+
```js
|
|
121
|
+
const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('google-img-scrap');
|
|
122
|
+
|
|
123
|
+
(async function(){
|
|
124
|
+
const test = await GOOGLE_IMG_SCRAP({
|
|
125
|
+
search: "demon slayer background hd",
|
|
126
|
+
query: {
|
|
127
|
+
SIZE: GOOGLE_QUERY.SIZE.LARGE,
|
|
128
|
+
},
|
|
129
|
+
domains: ["alphacoders.com"],
|
|
130
|
+
safeSearch: false,
|
|
131
|
+
execute: function(element){
|
|
132
|
+
if(!element.url.match('gstatic.com')) return element;
|
|
133
|
+
}
|
|
134
|
+
});
|
|
135
|
+
|
|
136
|
+
console.log(test, test.result[test.result.length-1].url, test.result.length);
|
|
137
|
+
})();
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## How urlMatch and filterByTitles work ?
|
|
141
|
+
|
|
142
|
+
- urlMatch works like filterByTitles
|
|
143
|
+
|
|
144
|
+
```js
|
|
145
|
+
const { GOOGLE_IMG_SCRAP } = require('google-img-scrap');
|
|
146
|
+
|
|
147
|
+
(async function(){
|
|
148
|
+
const test = await GOOGLE_IMG_SCRAP({
|
|
149
|
+
search: "cats",
|
|
150
|
+
//will build something like this "(draw and white) or (albino and white)"
|
|
151
|
+
filterByTitles: [
|
|
152
|
+
["draw", "white"],
|
|
153
|
+
["albino", "white"]
|
|
154
|
+
],
|
|
155
|
+
execute: function(element){
|
|
156
|
+
if(!element.url.match('gstatic.com')) return element;
|
|
157
|
+
}
|
|
158
|
+
});
|
|
159
|
+
|
|
160
|
+
console.log(test, test.result.length);
|
|
161
|
+
})();
|
|
162
|
+
```
|
|
82
163
|
|
|
83
164
|
## Google query
|
|
84
165
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "google-img-scrap",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.4",
|
|
4
4
|
"description": "Scrap images from google images with a lot of options",
|
|
5
5
|
"main": "./src/google-img-scrap.js",
|
|
6
6
|
"directories": {
|
|
@@ -35,6 +35,7 @@
|
|
|
35
35
|
},
|
|
36
36
|
"homepage": "https://github.com/yoannchb-pro/google-img-scrap#readme",
|
|
37
37
|
"dependencies": {
|
|
38
|
+
"fast-html-dom-parser": "^1.0.5",
|
|
38
39
|
"got": "^11.0.0"
|
|
39
40
|
}
|
|
40
41
|
}
|
package/src/google-img-scrap.js
CHANGED
|
@@ -10,6 +10,8 @@ const { buildQuery, unicodeToChar } = require('./utils/UTILS');
|
|
|
10
10
|
|
|
11
11
|
//verify good configuration
|
|
12
12
|
function verify(config){
|
|
13
|
+
if(config.excludeDomains && config.domains) throw "Can not set 'excludeDomains' and 'domains' as same times";
|
|
14
|
+
|
|
13
15
|
if(!config.search || config.search.trim() == "") throw "'search' can not be empty";
|
|
14
16
|
|
|
15
17
|
if(config.query){
|
|
@@ -77,16 +79,52 @@ async function GOOGLE_IMG_SCRAP(config = {}){
|
|
|
77
79
|
|
|
78
80
|
//exclude domains
|
|
79
81
|
const EXCLUDE_DOMAINS = [];
|
|
80
|
-
if(config.excludeDomains) config.excludeDomains.forEach((domain) => EXCLUDE_DOMAINS.push(`-site
|
|
82
|
+
if(config.excludeDomains) config.excludeDomains.forEach((domain) => EXCLUDE_DOMAINS.push(`-site:"${domain}"`));
|
|
81
83
|
|
|
82
84
|
//domains
|
|
83
85
|
const DOMAINS = [];
|
|
84
86
|
if(config.domains) config.domains.forEach((domain) => DOMAINS.push(`site:"${domain}"`));
|
|
85
87
|
|
|
88
|
+
//exclude words
|
|
89
|
+
const EXCLUDE_WORDS = [];
|
|
90
|
+
if(config.excludeWords) config.excludeWords.forEach((word) => EXCLUDE_WORDS.push(`-"${word}"`));
|
|
91
|
+
|
|
92
|
+
//filter by titles
|
|
93
|
+
const FILTER_TITLE = [];
|
|
94
|
+
if(config.filterByTitles) config.filterByTitles.forEach((titleFilter) => {
|
|
95
|
+
|
|
96
|
+
const value = titleFilter.map((title) => {
|
|
97
|
+
return `intitle:"${title}"`;
|
|
98
|
+
});
|
|
99
|
+
|
|
100
|
+
FILTER_TITLE.push(`(${value.join(' AND ')})`);
|
|
101
|
+
|
|
102
|
+
});
|
|
103
|
+
|
|
104
|
+
//url match words
|
|
105
|
+
const URL_MATCH = [];
|
|
106
|
+
if(config.urlMatch) config.urlMatch.forEach((urlMatch) => {
|
|
107
|
+
|
|
108
|
+
const value = urlMatch.map((content) => {
|
|
109
|
+
return `inurl:${content}`;
|
|
110
|
+
});
|
|
111
|
+
|
|
112
|
+
URL_MATCH.push(`(${value.join(' AND ')})`);
|
|
113
|
+
|
|
114
|
+
});
|
|
115
|
+
|
|
86
116
|
//building url
|
|
117
|
+
const SEARCH_TERM = config.search +
|
|
118
|
+
" " + URL_MATCH.join(" OR ") +
|
|
119
|
+
" " + FILTER_TITLE.join(" OR ") +
|
|
120
|
+
" " + EXCLUDE_WORDS.join(" ") +
|
|
121
|
+
" " + EXCLUDE_DOMAINS.join(" ") +
|
|
122
|
+
" " + DOMAINS.join(" OR ");
|
|
123
|
+
|
|
124
|
+
const SEARCH = encodeURIComponent(SEARCH_TERM.trim())
|
|
87
125
|
const QUERY = Object.assign(GOOGLE_CONSTANT.forceGoogleImage, {
|
|
88
126
|
[GOOGLE_CONSTANT.queryParam]: Object.values(config.query || {}).join(','),
|
|
89
|
-
q:
|
|
127
|
+
q: SEARCH,
|
|
90
128
|
});
|
|
91
129
|
|
|
92
130
|
const CUSTOM_PARAM = config.custom ? `&${config.custom}` : "";
|
|
@@ -97,8 +135,16 @@ async function GOOGLE_IMG_SCRAP(config = {}){
|
|
|
97
135
|
//parsing
|
|
98
136
|
const result = await parse(URL);
|
|
99
137
|
|
|
138
|
+
//excute function
|
|
139
|
+
let finalResult = [];
|
|
140
|
+
if(config.execute) result.forEach((element) => {
|
|
141
|
+
const value = config.execute(element);
|
|
142
|
+
if(value) finalResult.push(value);
|
|
143
|
+
});
|
|
144
|
+
else finalResult = result;
|
|
145
|
+
|
|
100
146
|
//result
|
|
101
|
-
return {url: URL, result:
|
|
147
|
+
return {url: URL, result: finalResult};
|
|
102
148
|
};
|
|
103
149
|
|
|
104
150
|
module.exports = { GOOGLE_IMG_SCRAP , GOOGLE_QUERY };
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
const { GOOGLE_IMG_SCRAP } = require('../src/google-img-scrap');
|
|
2
|
+
|
|
3
|
+
(async function(){
|
|
4
|
+
const test = await GOOGLE_IMG_SCRAP({
|
|
5
|
+
search: "cats",
|
|
6
|
+
filterByTitles: [
|
|
7
|
+
["draw", "white"],
|
|
8
|
+
["albino", "white"]
|
|
9
|
+
],
|
|
10
|
+
execute: function(element){
|
|
11
|
+
if(!element.url.match('gstatic.com')) return element;
|
|
12
|
+
}
|
|
13
|
+
});
|
|
14
|
+
|
|
15
|
+
console.log(test, test.result.length);
|
|
16
|
+
})();
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
const { GOOGLE_IMG_SCRAP } = require('../src/google-img-scrap');
|
|
2
|
+
|
|
3
|
+
(async function(){
|
|
4
|
+
const test = await GOOGLE_IMG_SCRAP({
|
|
5
|
+
search: "cats",
|
|
6
|
+
urlMatch: [
|
|
7
|
+
["cdn"],
|
|
8
|
+
["istockphoto"]
|
|
9
|
+
],
|
|
10
|
+
execute: function(element){
|
|
11
|
+
if(!element.url.match('gstatic.com')) return element;
|
|
12
|
+
}
|
|
13
|
+
});
|
|
14
|
+
|
|
15
|
+
console.log(test, test.result.length);
|
|
16
|
+
})();
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('../src/google-img-scrap');
|
|
2
|
+
|
|
3
|
+
(async function(){
|
|
4
|
+
const test = await GOOGLE_IMG_SCRAP({
|
|
5
|
+
search: "demon slayer background hd",
|
|
6
|
+
query: {
|
|
7
|
+
SIZE: GOOGLE_QUERY.SIZE.LARGE,
|
|
8
|
+
},
|
|
9
|
+
domains: ["alphacoders.com"],
|
|
10
|
+
safeSearch: false,
|
|
11
|
+
execute: function(element){
|
|
12
|
+
if(!element.url.match('gstatic.com')) return element;
|
|
13
|
+
}
|
|
14
|
+
});
|
|
15
|
+
|
|
16
|
+
console.log(test, test.result[test.result.length-1].url, test.result.length);
|
|
17
|
+
})();
|
package/test/test.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('../src/google-img-scrap');
|
|
2
2
|
|
|
3
|
-
console.log(GOOGLE_QUERY);
|
|
3
|
+
// console.log(GOOGLE_QUERY);
|
|
4
4
|
|
|
5
5
|
(async function(){
|
|
6
6
|
const test = await GOOGLE_IMG_SCRAP({
|
|
@@ -13,10 +13,14 @@ console.log(GOOGLE_QUERY);
|
|
|
13
13
|
LICENCE: GOOGLE_QUERY.LICENCE.COMMERCIAL_AND_OTHER,
|
|
14
14
|
EXTENSION: GOOGLE_QUERY.EXTENSION.JPG
|
|
15
15
|
},
|
|
16
|
-
domains: [],
|
|
16
|
+
domains: ["alamy.com", "istockphoto.com", "vecteezy.com"],
|
|
17
|
+
excludeWords: ["black", "white"], //If you don't like black and white cats
|
|
17
18
|
custom: "name=content&name2=content2",
|
|
18
19
|
safeSearch: false,
|
|
19
|
-
|
|
20
|
+
execute: function(element){
|
|
21
|
+
if(!element.url.match('gstatic.com')) return element;
|
|
22
|
+
}
|
|
23
|
+
// excludeDomains: ["istockphoto.com", "alamy.com"]
|
|
20
24
|
});
|
|
21
25
|
|
|
22
26
|
console.log(test, test.result.length);
|