embulk-filter-crawler 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +5 -5
- data/build.gradle +1 -1
- data/src/main/java/org/embulk/filter/crawler/CrawlerFilterPlugin.java +14 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 30a79fc0749463b0dce27c96cb1e32f6cdced9f2
|
4
|
+
data.tar.gz: 80db4bafe0d5b0418aa8cf18be52c8cfe66f2e95
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7c7df4cb9d0e103a8dc17e34ee90be45da7c659f5a0a0fc06d28215854dc12b026de9e4eaf4b65354c3befcc191c0a60efc33bd3c0c39ba4ac77eb3a0d327760
|
7
|
+
data.tar.gz: fc890159a20819a203eae0ae61eb4cce3a9b8bc11d2e8903a61bd5a59ba1ccf451dd8f58e5e918eeb0d1f0f07fcff2138491c343c1eb7a1b52e01e219816614a
|
data/README.md
CHANGED
@@ -10,13 +10,14 @@ Write short description here and build.gradle file.
|
|
10
10
|
|
11
11
|
- **target_key**: base_url column key name (string, require)
|
12
12
|
- **max_depth_of_crawling**: max depth of crawling (integer, default: unlimited)
|
13
|
-
- **
|
14
|
-
- **
|
15
|
-
- **max_pages_to_fetch**: max_pages_to_fetch (string, default: unlimited)
|
13
|
+
- **number_of_crawlers**: parallelism (integer, default: 1)
|
14
|
+
- **max_pages_to_fetch**: max_pages_to_fetch (integer, default: unlimited)
|
16
15
|
- **crawl_storage_folder**: crawl_storage_folder (string, require)
|
17
16
|
- **politeness_delay**: politeness_delay (integer, default: null)
|
18
17
|
- **user_agent_string**: user_agent_string (string, default: null)
|
19
|
-
- **
|
18
|
+
- **output_prefix**: output_prefix (string, default: "")
|
19
|
+
- **connection_timeout**: connection timeout millisecond (integer, default: 30000)
|
20
|
+
- **socket_timeout**: socket timeout millisecond (integer, default: 20000)
|
20
21
|
|
21
22
|
## Example
|
22
23
|
|
@@ -33,7 +34,6 @@ filters:
|
|
33
34
|
- type: crawler
|
34
35
|
target_key: url
|
35
36
|
number_of_crawlers: 10
|
36
|
-
seed_size: 100
|
37
37
|
max_depth_of_crawling: 4
|
38
38
|
politeness_delay: 100
|
39
39
|
crawl_storage_folder: "/tmp/crawl/%s"
|
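data/build.gradle
CHANGED
(diff body for data/build.gradle not captured in this extract)
data/src/main/java/org/embulk/filter/crawler/CrawlerFilterPlugin.java
CHANGED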
@@ -74,6 +74,14 @@ public class CrawlerFilterPlugin
|
|
74
74
|
@Config("should_not_visit_pattern")
|
75
75
|
@ConfigDefault("null")
|
76
76
|
public Optional<String> getShouldNotVisitPattern();
|
77
|
+
|
78
|
+
@Config("connection_timeout")
|
79
|
+
@ConfigDefault("null")
|
80
|
+
public Optional<Integer> getConnectionTimeout();
|
81
|
+
|
82
|
+
@Config("socket_timeout")
|
83
|
+
@ConfigDefault("null")
|
84
|
+
public Optional<Integer> getSocketTimeout();
|
77
85
|
}
|
78
86
|
|
79
87
|
@Override
|
@@ -202,6 +210,12 @@ public class CrawlerFilterPlugin
|
|
202
210
|
if (task.getUserAgentString().isPresent()) {
|
203
211
|
config.setUserAgentString(task.getUserAgentString().get());
|
204
212
|
}
|
213
|
+
if (task.getSocketTimeout().isPresent()) {
|
214
|
+
config.setSocketTimeout(task.getSocketTimeout().get());
|
215
|
+
}
|
216
|
+
if (task.getConnectionTimeout().isPresent()) {
|
217
|
+
config.setConnectionTimeout(task.getConnectionTimeout().get());
|
218
|
+
}
|
205
219
|
|
206
220
|
PageFetcher pageFetcher = new PageFetcher(config);
|
207
221
|
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-filter-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- toyama0919
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-03-
|
11
|
+
date: 2016-03-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -73,7 +73,7 @@ files:
|
|
73
73
|
- classpath/commons-logging-1.2.jar
|
74
74
|
- classpath/crawler4j-4.2.jar
|
75
75
|
- classpath/dom4j-1.6.1.jar
|
76
|
-
- classpath/embulk-filter-crawler-0.1.0.jar
|
76
|
+
- classpath/embulk-filter-crawler-0.1.1.jar
|
77
77
|
- classpath/fontbox-1.8.4.jar
|
78
78
|
- classpath/geronimo-stax-api_1.0_spec-1.0.1.jar
|
79
79
|
- classpath/httpclient-4.4.jar
|