embulk-filter-crawler 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 52c2e67425310d86c6695fecbd4eeec2e80bc1f7
|
4
|
+
data.tar.gz: 12615ac3b93eb0eef4dd7c057f2508b070770611
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 691bdc10a98944c09892392eab8187d14209cc00724d235c4fdab2d5d66f202254133da858a23a11371142941eb4bb060dda46960a529d5fb6e1091e4ef2ba9f
|
7
|
+
data.tar.gz: 4c5d5da29cbda4ba32f2450aeea928f3779179634565450214494d9ca9a29de41f63f1df536e15cac6f4ce98c3c1e32242396c2d13df740b00628d8f8b30219a
|
data/build.gradle
CHANGED
@@ -19,6 +19,7 @@ import org.embulk.spi.PageOutput;
|
|
19
19
|
import org.embulk.spi.PageReader;
|
20
20
|
import org.embulk.spi.Schema;
|
21
21
|
import org.embulk.spi.type.Types;
|
22
|
+
import org.slf4j.Logger;
|
22
23
|
|
23
24
|
import com.google.common.base.Optional;
|
24
25
|
import com.google.common.collect.ImmutableList;
|
@@ -34,6 +35,8 @@ import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
|
|
34
35
|
public class CrawlerFilterPlugin
|
35
36
|
implements FilterPlugin
|
36
37
|
{
|
38
|
+
private static final Logger logger = Exec.getLogger(CrawlerFilterPlugin.class);
|
39
|
+
|
37
40
|
public interface PluginTask
|
38
41
|
extends Task
|
39
42
|
{
|
@@ -131,21 +134,10 @@ public class CrawlerFilterPlugin
|
|
131
134
|
return new PageOutput() {
|
132
135
|
private PageReader reader = new PageReader(inputSchema);
|
133
136
|
private PageBuilder builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
|
134
|
-
private CrawlController controller = getController();
|
135
137
|
|
136
138
|
@Override
|
137
139
|
public void finish()
|
138
140
|
{
|
139
|
-
for (Object object : controller.getCrawlersLocalData()) {
|
140
|
-
CrawlStat crawlStat = (CrawlStat) object;
|
141
|
-
for (Map<String, Object> map : crawlStat.getPages()) {
|
142
|
-
for (Column outputColumn : outputSchema.getColumns()) {
|
143
|
-
final Object value = map.get(outputColumn.getName());
|
144
|
-
setValue(value, outputColumn);
|
145
|
-
}
|
146
|
-
builder.addRecord();
|
147
|
-
}
|
148
|
-
}
|
149
141
|
builder.finish();
|
150
142
|
}
|
151
143
|
|
@@ -158,6 +150,7 @@ public class CrawlerFilterPlugin
|
|
158
150
|
@Override
|
159
151
|
public void add(Page page)
|
160
152
|
{
|
153
|
+
CrawlController controller = getController();
|
161
154
|
reader.setPage(page);
|
162
155
|
while (reader.nextRecord()) {
|
163
156
|
controller.addSeed(reader.getString(keyNameColumn));
|
@@ -169,6 +162,16 @@ public class CrawlerFilterPlugin
|
|
169
162
|
}
|
170
163
|
controller.setCustomData(customData);
|
171
164
|
controller.start(EmbulkCrawler.class, task.getNumberOfCrawlers());
|
165
|
+
for (Object object : controller.getCrawlersLocalData()) {
|
166
|
+
CrawlStat crawlStat = (CrawlStat) object;
|
167
|
+
for (Map<String, Object> map : crawlStat.getPages()) {
|
168
|
+
for (Column outputColumn : outputSchema.getColumns()) {
|
169
|
+
final Object value = map.get(outputColumn.getName());
|
170
|
+
setValue(value, outputColumn);
|
171
|
+
}
|
172
|
+
builder.addRecord();
|
173
|
+
}
|
174
|
+
}
|
172
175
|
}
|
173
176
|
|
174
177
|
/**
|
@@ -91,7 +91,7 @@ public class EmbulkCrawler extends WebCrawler
|
|
91
91
|
BinaryParseData binaryParseData = (BinaryParseData) parseData;
|
92
92
|
map.put(outputPrefix + Constants.HTML, binaryParseData.getHtml());
|
93
93
|
}
|
94
|
-
logger.
|
94
|
+
logger.debug("{}", webURL.getURL());
|
95
95
|
myCrawlStat.pages.add(map);
|
96
96
|
}
|
97
97
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-filter-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- toyama0919
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-04-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -73,7 +73,7 @@ files:
|
|
73
73
|
- classpath/commons-logging-1.2.jar
|
74
74
|
- classpath/crawler4j-4.2.jar
|
75
75
|
- classpath/dom4j-1.6.1.jar
|
76
|
-
- classpath/embulk-filter-crawler-0.1.1.jar
|
76
|
+
- classpath/embulk-filter-crawler-0.1.2.jar
|
77
77
|
- classpath/fontbox-1.8.4.jar
|
78
78
|
- classpath/geronimo-stax-api_1.0_spec-1.0.1.jar
|
79
79
|
- classpath/httpclient-4.4.jar
|