google-img-scrap 1.0.0 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md ADDED
@@ -0,0 +1,17 @@
1
+ # Changelog
2
+
3
+ ### v1.0.4
4
+
5
+ - New option ```urlMatch```. You can now get images whose URL matches a string (example: "cdn")
6
+ - New option ```filterByTitles```. Filter images by titles
7
+
8
+ ### v1.0.3
9
+ - New option ```execute```. Allows you to run a function on each result, for example to remove "gstatic.com" domains
10
+
11
+ ### v1.0.2
12
+ - Cannot set 'domains' and 'excludeDomains' at the same time
13
+ - Fixed some bugs
14
+ - New option ```excludeWords```
15
+
16
+ ### v1.0.1
17
+ - Added the missing dependency
package/README.md CHANGED
@@ -1,5 +1,10 @@
1
- # Google-img-scrap v1.0.0
2
- Scrap images from google image with lot of options
1
+ # Google-img-scrap v1.0.4
2
+
3
+ Scrape images from Google Images with a lot of tools and options.
4
+
5
+ ## Update
6
+
7
+ - See [changelog](CHANGELOG.md)
3
8
 
4
9
  ## Found a bug ?
5
10
 
@@ -13,21 +18,56 @@ npm i google-img-scrap
13
18
 
14
19
  ## Import
15
20
 
21
+ - NPM
22
+
23
+ ```js
24
+ const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('google-img-scrap');
25
+ ```
26
+
27
+ - From GITHUB
28
+
16
29
  ```js
17
- const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('../src/google-img-scrap');
30
+ const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('./src/google-img-scrap');
18
31
  ```
19
32
 
33
+ ## Params
34
+
35
+ - "search" (String) what you want to search
36
+ - "execute" (Function) lets you run a function on each result, for example to remove "gstatic.com" domains
37
+ - "excludeWords" (Array of String) exclude some words from the search
38
+ - "domains" (Array of String) filter by domains
39
+ - "excludeDomains" (Array of String) exclude some domains
40
+ - "safeSearch" (Boolean) enable or disable safe search (for example, to filter NSFW content)
41
+ - "custom" (String) add extra query
42
+ - "urlMatch" (Array of Array) get images whose URL matches a string (example: "cdn") | ```example below```
43
+ - "filterByTitles" (Array of Array) filter images by titles | ```example below```
44
+ - "query" (Object) set a query (can be [TYPE, DATE, COLOR, SIZE, LICENCE, EXTENSION]) (use GOOGLE_QUERY items) | ```example below```
45
+
20
46
  ## Result
21
47
 
22
48
  ```js
23
- [
49
+ }
50
+ {
51
+ url: 'https://images.google.com/search?tbm=isch&tbs=itp:clipart,qdr:y,ic:gray,isz:l,il:ol,ift:jpg&q=cats',
52
+ result: [
53
+ {
54
+ url: 'https://media.istockphoto.com/vectors/black-cats-set-vector-id599123506',
55
+ height: '806',
56
+ width: '1024'
57
+ },
24
58
  {
25
- url: "...",
26
- width: 1920,
27
- height: 1080
59
+ url: 'https://media.istockphoto.com/vectors/cats-vector-id455327075',
60
+ height: '860',
61
+ width: '1024'
62
+ },
63
+ {
64
+ url: 'https://media.istockphoto.com/vectors/purring-cats-vector-silhouette-vector-id165749810?s=2048x2048',
65
+ height: '1895',
66
+ width: '2048'
28
67
  },
29
68
  ...
30
- ]
69
+ ]
70
+ }
31
71
  ```
32
72
 
33
73
  ## How to use ?
@@ -35,7 +75,9 @@ const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('../src/google-img-scrap');
35
75
  - For the query parameter you need to set the name in upper case !
36
76
 
37
77
  ```js
38
- const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('../src/google-img-scrap');
78
+ const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('google-img-scrap');
79
+
80
+ console.log(GOOGLE_QUERY);
39
81
 
40
82
  (async function(){
41
83
  const test = await GOOGLE_IMG_SCRAP({
@@ -48,10 +90,11 @@ const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('../src/google-img-scrap');
48
90
  LICENCE: GOOGLE_QUERY.LICENCE.COMMERCIAL_AND_OTHER,
49
91
  EXTENSION: GOOGLE_QUERY.EXTENSION.JPG
50
92
  },
51
- domains: [],
93
+ domains: ["alamy.com", "istockphoto.com", "vecteezy.com"],
94
+ excludeWords: ["black", "white"], //If you don't like black and white cats
52
95
  custom: "name=content&name2=content2",
53
96
  safeSearch: false,
54
- excludeDomains: []
97
+ // excludeDomains: ["istockphoto.com", "alamy.com"]
55
98
  });
56
99
 
57
100
  console.log(test, test.result.length);
@@ -61,7 +104,7 @@ const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('../src/google-img-scrap');
61
104
  OR ALSO
62
105
 
63
106
  ```js
64
- const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('../src/google-img-scrap');
107
+ const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('google-img-scrap');
65
108
 
66
109
  (async function(){
67
110
  const test = await GOOGLE_IMG_SCRAP({
@@ -71,14 +114,52 @@ const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('../src/google-img-scrap');
71
114
  console.log(test, test.result.length);
72
115
  })();
73
116
  ```
74
- ## Params
75
117
 
76
- - "search" what you want to search
77
- - "domains" filter by domains
78
- - "excludeDomains" exclude some domains
79
- - "safeSearch" active safe search or not
80
- - "custom" add extra query
81
- - "query" set a query (can be [TYPE, DATE, COLOR, SIZE, LICENCE, EXTENSION])
118
+ ## Removing gstatic.com
119
+
120
+ ```js
121
+ const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('google-img-scrap');
122
+
123
+ (async function(){
124
+ const test = await GOOGLE_IMG_SCRAP({
125
+ search: "demon slayer background hd",
126
+ query: {
127
+ SIZE: GOOGLE_QUERY.SIZE.LARGE,
128
+ },
129
+ domains: ["alphacoders.com"],
130
+ safeSearch: false,
131
+ execute: function(element){
132
+ if(!element.url.match('gstatic.com')) return element;
133
+ }
134
+ });
135
+
136
+ console.log(test, test.result[test.result.length-1].url, test.result.length);
137
+ })();
138
+ ```
139
+
140
+ ## How do urlMatch and filterByTitles work?
141
+
142
+ - urlMatch works like filterByTitles
143
+
144
+ ```js
145
+ const { GOOGLE_IMG_SCRAP } = require('google-img-scrap');
146
+
147
+ (async function(){
148
+ const test = await GOOGLE_IMG_SCRAP({
149
+ search: "cats",
150
+ //will build something like this "(draw and white) or (albino and white)"
151
+ filterByTitles: [
152
+ ["draw", "white"],
153
+ ["albino", "white"]
154
+ ],
155
+ execute: function(element){
156
+ if(!element.url.match('gstatic.com')) return element;
157
+ }
158
+ });
159
+
160
+ console.log(test, test.result.length);
161
+ })();
162
+ ```
82
163
 
83
164
  ## Google query
84
165
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "google-img-scrap",
3
- "version": "1.0.0",
3
+ "version": "1.0.4",
4
4
  "description": "Scrap images from google images with a lot of options",
5
5
  "main": "./src/google-img-scrap.js",
6
6
  "directories": {
@@ -35,6 +35,7 @@
35
35
  },
36
36
  "homepage": "https://github.com/yoannchb-pro/google-img-scrap#readme",
37
37
  "dependencies": {
38
+ "fast-html-dom-parser": "^1.0.5",
38
39
  "got": "^11.0.0"
39
40
  }
40
41
  }
@@ -10,6 +10,8 @@ const { buildQuery, unicodeToChar } = require('./utils/UTILS');
10
10
 
11
11
  //verify good configuration
12
12
  function verify(config){
13
+ if(config.excludeDomains && config.domains) throw "Can not set 'excludeDomains' and 'domains' as same times";
14
+
13
15
  if(!config.search || config.search.trim() == "") throw "'search' can not be empty";
14
16
 
15
17
  if(config.query){
@@ -77,16 +79,52 @@ async function GOOGLE_IMG_SCRAP(config = {}){
77
79
 
78
80
  //exclude domains
79
81
  const EXCLUDE_DOMAINS = [];
80
- if(config.excludeDomains) config.excludeDomains.forEach((domain) => EXCLUDE_DOMAINS.push(`-site:${domain}`));
82
+ if(config.excludeDomains) config.excludeDomains.forEach((domain) => EXCLUDE_DOMAINS.push(`-site:"${domain}"`));
81
83
 
82
84
  //domains
83
85
  const DOMAINS = [];
84
86
  if(config.domains) config.domains.forEach((domain) => DOMAINS.push(`site:"${domain}"`));
85
87
 
88
+ //exclude words
89
+ const EXCLUDE_WORDS = [];
90
+ if(config.excludeWords) config.excludeWords.forEach((word) => EXCLUDE_WORDS.push(`-"${word}"`));
91
+
92
+ //filter by titles
93
+ const FILTER_TITLE = [];
94
+ if(config.filterByTitles) config.filterByTitles.forEach((titleFilter) => {
95
+
96
+ const value = titleFilter.map((title) => {
97
+ return `intitle:"${title}"`;
98
+ });
99
+
100
+ FILTER_TITLE.push(`(${value.join(' AND ')})`);
101
+
102
+ });
103
+
104
+ //url match words
105
+ const URL_MATCH = [];
106
+ if(config.urlMatch) config.urlMatch.forEach((urlMatch) => {
107
+
108
+ const value = urlMatch.map((content) => {
109
+ return `inurl:${content}`;
110
+ });
111
+
112
+ URL_MATCH.push(`(${value.join(' AND ')})`);
113
+
114
+ });
115
+
86
116
  //building url
117
+ const SEARCH_TERM = config.search +
118
+ " " + URL_MATCH.join(" OR ") +
119
+ " " + FILTER_TITLE.join(" OR ") +
120
+ " " + EXCLUDE_WORDS.join(" ") +
121
+ " " + EXCLUDE_DOMAINS.join(" ") +
122
+ " " + DOMAINS.join(" OR ");
123
+
124
+ const SEARCH = encodeURIComponent(SEARCH_TERM.trim())
87
125
  const QUERY = Object.assign(GOOGLE_CONSTANT.forceGoogleImage, {
88
126
  [GOOGLE_CONSTANT.queryParam]: Object.values(config.query || {}).join(','),
89
- q: encodeURIComponent(config.search + " " + EXCLUDE_DOMAINS.join(" ") + " " + DOMAINS.join(' OR ')),
127
+ q: SEARCH,
90
128
  });
91
129
 
92
130
  const CUSTOM_PARAM = config.custom ? `&${config.custom}` : "";
@@ -97,8 +135,16 @@ async function GOOGLE_IMG_SCRAP(config = {}){
97
135
  //parsing
98
136
  const result = await parse(URL);
99
137
 
138
+ //excute function
139
+ let finalResult = [];
140
+ if(config.execute) result.forEach((element) => {
141
+ const value = config.execute(element);
142
+ if(value) finalResult.push(value);
143
+ });
144
+ else finalResult = result;
145
+
100
146
  //result
101
- return {url: URL, result: result};
147
+ return {url: URL, result: finalResult};
102
148
  };
103
149
 
104
150
  module.exports = { GOOGLE_IMG_SCRAP , GOOGLE_QUERY };
@@ -0,0 +1,16 @@
1
+ const { GOOGLE_IMG_SCRAP } = require('../src/google-img-scrap');
2
+
3
+ (async function(){
4
+ const test = await GOOGLE_IMG_SCRAP({
5
+ search: "cats",
6
+ filterByTitles: [
7
+ ["draw", "white"],
8
+ ["albino", "white"]
9
+ ],
10
+ execute: function(element){
11
+ if(!element.url.match('gstatic.com')) return element;
12
+ }
13
+ });
14
+
15
+ console.log(test, test.result.length);
16
+ })();
@@ -0,0 +1,16 @@
1
+ const { GOOGLE_IMG_SCRAP } = require('../src/google-img-scrap');
2
+
3
+ (async function(){
4
+ const test = await GOOGLE_IMG_SCRAP({
5
+ search: "cats",
6
+ urlMatch: [
7
+ ["cdn"],
8
+ ["istockphoto"]
9
+ ],
10
+ execute: function(element){
11
+ if(!element.url.match('gstatic.com')) return element;
12
+ }
13
+ });
14
+
15
+ console.log(test, test.result.length);
16
+ })();
@@ -0,0 +1,17 @@
1
+ const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('../src/google-img-scrap');
2
+
3
+ (async function(){
4
+ const test = await GOOGLE_IMG_SCRAP({
5
+ search: "demon slayer background hd",
6
+ query: {
7
+ SIZE: GOOGLE_QUERY.SIZE.LARGE,
8
+ },
9
+ domains: ["alphacoders.com"],
10
+ safeSearch: false,
11
+ execute: function(element){
12
+ if(!element.url.match('gstatic.com')) return element;
13
+ }
14
+ });
15
+
16
+ console.log(test, test.result[test.result.length-1].url, test.result.length);
17
+ })();
package/test/test.js CHANGED
@@ -1,6 +1,6 @@
1
1
  const { GOOGLE_IMG_SCRAP , GOOGLE_QUERY } = require('../src/google-img-scrap');
2
2
 
3
- console.log(GOOGLE_QUERY);
3
+ // console.log(GOOGLE_QUERY);
4
4
 
5
5
  (async function(){
6
6
  const test = await GOOGLE_IMG_SCRAP({
@@ -13,10 +13,14 @@ console.log(GOOGLE_QUERY);
13
13
  LICENCE: GOOGLE_QUERY.LICENCE.COMMERCIAL_AND_OTHER,
14
14
  EXTENSION: GOOGLE_QUERY.EXTENSION.JPG
15
15
  },
16
- domains: [],
16
+ domains: ["alamy.com", "istockphoto.com", "vecteezy.com"],
17
+ excludeWords: ["black", "white"], //If you don't like black and white cats
17
18
  custom: "name=content&name2=content2",
18
19
  safeSearch: false,
19
- excludeDomains: []
20
+ execute: function(element){
21
+ if(!element.url.match('gstatic.com')) return element;
22
+ }
23
+ // excludeDomains: ["istockphoto.com", "alamy.com"]
20
24
  });
21
25
 
22
26
  console.log(test, test.result.length);