embulk-filter-crawler 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 52c2e67425310d86c6695fecbd4eeec2e80bc1f7
|
4
|
+
data.tar.gz: 12615ac3b93eb0eef4dd7c057f2508b070770611
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 691bdc10a98944c09892392eab8187d14209cc00724d235c4fdab2d5d66f202254133da858a23a11371142941eb4bb060dda46960a529d5fb6e1091e4ef2ba9f
|
7
|
+
data.tar.gz: 4c5d5da29cbda4ba32f2450aeea928f3779179634565450214494d9ca9a29de41f63f1df536e15cac6f4ce98c3c1e32242396c2d13df740b00628d8f8b30219a
|
data/build.gradle
CHANGED
@@ -19,6 +19,7 @@ import org.embulk.spi.PageOutput;
|
|
19
19
|
import org.embulk.spi.PageReader;
|
20
20
|
import org.embulk.spi.Schema;
|
21
21
|
import org.embulk.spi.type.Types;
|
22
|
+
import org.slf4j.Logger;
|
22
23
|
|
23
24
|
import com.google.common.base.Optional;
|
24
25
|
import com.google.common.collect.ImmutableList;
|
@@ -34,6 +35,8 @@ import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
|
|
34
35
|
public class CrawlerFilterPlugin
|
35
36
|
implements FilterPlugin
|
36
37
|
{
|
38
|
+
private static final Logger logger = Exec.getLogger(CrawlerFilterPlugin.class);
|
39
|
+
|
37
40
|
public interface PluginTask
|
38
41
|
extends Task
|
39
42
|
{
|
@@ -131,21 +134,10 @@ public class CrawlerFilterPlugin
|
|
131
134
|
return new PageOutput() {
|
132
135
|
private PageReader reader = new PageReader(inputSchema);
|
133
136
|
private PageBuilder builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
|
134
|
-
private CrawlController controller = getController();
|
135
137
|
|
136
138
|
@Override
|
137
139
|
public void finish()
|
138
140
|
{
|
139
|
-
for (Object object : controller.getCrawlersLocalData()) {
|
140
|
-
CrawlStat crawlStat = (CrawlStat) object;
|
141
|
-
for (Map<String, Object> map : crawlStat.getPages()) {
|
142
|
-
for (Column outputColumn : outputSchema.getColumns()) {
|
143
|
-
final Object value = map.get(outputColumn.getName());
|
144
|
-
setValue(value, outputColumn);
|
145
|
-
}
|
146
|
-
builder.addRecord();
|
147
|
-
}
|
148
|
-
}
|
149
141
|
builder.finish();
|
150
142
|
}
|
151
143
|
|
@@ -158,6 +150,7 @@ public class CrawlerFilterPlugin
|
|
158
150
|
@Override
|
159
151
|
public void add(Page page)
|
160
152
|
{
|
153
|
+
CrawlController controller = getController();
|
161
154
|
reader.setPage(page);
|
162
155
|
while (reader.nextRecord()) {
|
163
156
|
controller.addSeed(reader.getString(keyNameColumn));
|
@@ -169,6 +162,16 @@ public class CrawlerFilterPlugin
|
|
169
162
|
}
|
170
163
|
controller.setCustomData(customData);
|
171
164
|
controller.start(EmbulkCrawler.class, task.getNumberOfCrawlers());
|
165
|
+
for (Object object : controller.getCrawlersLocalData()) {
|
166
|
+
CrawlStat crawlStat = (CrawlStat) object;
|
167
|
+
for (Map<String, Object> map : crawlStat.getPages()) {
|
168
|
+
for (Column outputColumn : outputSchema.getColumns()) {
|
169
|
+
final Object value = map.get(outputColumn.getName());
|
170
|
+
setValue(value, outputColumn);
|
171
|
+
}
|
172
|
+
builder.addRecord();
|
173
|
+
}
|
174
|
+
}
|
172
175
|
}
|
173
176
|
|
174
177
|
/**
|
@@ -91,7 +91,7 @@ public class EmbulkCrawler extends WebCrawler
|
|
91
91
|
BinaryParseData binaryParseData = (BinaryParseData) parseData;
|
92
92
|
map.put(outputPrefix + Constants.HTML, binaryParseData.getHtml());
|
93
93
|
}
|
94
|
-
logger.
|
94
|
+
logger.debug("{}", webURL.getURL());
|
95
95
|
myCrawlStat.pages.add(map);
|
96
96
|
}
|
97
97
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-filter-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- toyama0919
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-04-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -73,7 +73,7 @@ files:
|
|
73
73
|
- classpath/commons-logging-1.2.jar
|
74
74
|
- classpath/crawler4j-4.2.jar
|
75
75
|
- classpath/dom4j-1.6.1.jar
|
76
|
-
- classpath/embulk-filter-crawler-0.1.1.jar
|
76
|
+
- classpath/embulk-filter-crawler-0.1.2.jar
|
77
77
|
- classpath/fontbox-1.8.4.jar
|
78
78
|
- classpath/geronimo-stax-api_1.0_spec-1.0.1.jar
|
79
79
|
- classpath/httpclient-4.4.jar
|