embulk-filter-crawler 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +5 -5
- data/build.gradle +1 -1
- data/src/main/java/org/embulk/filter/crawler/CrawlerFilterPlugin.java +14 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 30a79fc0749463b0dce27c96cb1e32f6cdced9f2
|
4
|
+
data.tar.gz: 80db4bafe0d5b0418aa8cf18be52c8cfe66f2e95
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7c7df4cb9d0e103a8dc17e34ee90be45da7c659f5a0a0fc06d28215854dc12b026de9e4eaf4b65354c3befcc191c0a60efc33bd3c0c39ba4ac77eb3a0d327760
|
7
|
+
data.tar.gz: fc890159a20819a203eae0ae61eb4cce3a9b8bc11d2e8903a61bd5a59ba1ccf451dd8f58e5e918eeb0d1f0f07fcff2138491c343c1eb7a1b52e01e219816614a
|
data/README.md
CHANGED
@@ -10,13 +10,14 @@ Write short description here and build.gradle file.
|
|
10
10
|
|
11
11
|
- **target_key**: base_url column key name (string, required)
|
12
12
|
- **max_depth_of_crawling**: max depth of crawling (integer, default: unlimited)
|
13
|
-
- **
|
14
|
-
- **
|
15
|
-
- **max_pages_to_fetch**: max_pages_to_fetch (string, default: unlimited)
|
13
|
+
- **number_of_crawlers**: parallelism (integer, default: 1)
|
14
|
+
- **max_pages_to_fetch**: max_pages_to_fetch (integer, default: unlimited)
|
16
15
|
- **crawl_storage_folder**: crawl_storage_folder (string, required)
|
17
16
|
- **politeness_delay**: politeness_delay (integer, default: null)
|
18
17
|
- **user_agent_string**: user_agent_string (string, default: null)
|
19
|
-
- **
|
18
|
+
- **output_prefix**: output_prefix (string, default: "")
|
19
|
+
- **connection_timeout**: connection timeout millisecond (integer, default: 30000)
|
20
|
+
- **socket_timeout**: socket timeout millisecond (integer, default: 20000)
|
20
21
|
|
21
22
|
## Example
|
22
23
|
|
@@ -33,7 +34,6 @@ filters:
|
|
33
34
|
- type: crawler
|
34
35
|
target_key: url
|
35
36
|
number_of_crawlers: 10
|
36
|
-
seed_size: 100
|
37
37
|
max_depth_of_crawling: 4
|
38
38
|
politeness_delay: 100
|
39
39
|
crawl_storage_folder: "/tmp/crawl/%s"
|
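data/build.gradle
CHANGED
(diff body for this file is missing from the extracted page; the summary lists +1 -1)
data/src/main/java/org/embulk/filter/crawler/CrawlerFilterPlugin.java
CHANGED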
@@ -74,6 +74,14 @@ public class CrawlerFilterPlugin
|
|
74
74
|
@Config("should_not_visit_pattern")
|
75
75
|
@ConfigDefault("null")
|
76
76
|
public Optional<String> getShouldNotVisitPattern();
|
77
|
+
|
78
|
+
@Config("connection_timeout")
|
79
|
+
@ConfigDefault("null")
|
80
|
+
public Optional<Integer> getConnectionTimeout();
|
81
|
+
|
82
|
+
@Config("socket_timeout")
|
83
|
+
@ConfigDefault("null")
|
84
|
+
public Optional<Integer> getSocketTimeout();
|
77
85
|
}
|
78
86
|
|
79
87
|
@Override
|
@@ -202,6 +210,12 @@ public class CrawlerFilterPlugin
|
|
202
210
|
if (task.getUserAgentString().isPresent()) {
|
203
211
|
config.setUserAgentString(task.getUserAgentString().get());
|
204
212
|
}
|
213
|
+
if (task.getSocketTimeout().isPresent()) {
|
214
|
+
config.setSocketTimeout(task.getSocketTimeout().get());
|
215
|
+
}
|
216
|
+
if (task.getConnectionTimeout().isPresent()) {
|
217
|
+
config.setConnectionTimeout(task.getConnectionTimeout().get());
|
218
|
+
}
|
205
219
|
|
206
220
|
PageFetcher pageFetcher = new PageFetcher(config);
|
207
221
|
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-filter-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- toyama0919
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-03-
|
11
|
+
date: 2016-03-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -73,7 +73,7 @@ files:
|
|
73
73
|
- classpath/commons-logging-1.2.jar
|
74
74
|
- classpath/crawler4j-4.2.jar
|
75
75
|
- classpath/dom4j-1.6.1.jar
|
76
|
-
- classpath/embulk-filter-crawler-0.1.0.jar
|
76
|
+
- classpath/embulk-filter-crawler-0.1.1.jar
|
77
77
|
- classpath/fontbox-1.8.4.jar
|
78
78
|
- classpath/geronimo-stax-api_1.0_spec-1.0.1.jar
|
79
79
|
- classpath/httpclient-4.4.jar
|